diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
--- a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
+++ b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
@@ -562,10 +562,16 @@
                                      bool IsInSPMDExecutionMode = false) {
   CGBuilderTy &Bld = CGF.Builder;
   auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
-  return IsInSPMDExecutionMode
-             ? RT.getGPUNumThreads(CGF)
-             : Bld.CreateNUWSub(RT.getGPUNumThreads(CGF),
-                                RT.getGPUWarpSize(CGF), "thread_limit");
+  llvm::Value *ThreadLimit = nullptr;
+  if (IsInSPMDExecutionMode)
+    ThreadLimit = RT.getGPUNumThreads(CGF);
+  else {
+    llvm::Value *GPUNumThreads = RT.getGPUNumThreads(CGF);
+    llvm::Value *GPUWarpSize = RT.getGPUWarpSize(CGF);
+    ThreadLimit = Bld.CreateNUWSub(GPUNumThreads, GPUWarpSize, "thread_limit");
+  }
+  assert(ThreadLimit != nullptr && "Expected non-null ThreadLimit");
+  return ThreadLimit;
 }

 /// Get the thread id of the OMP master thread.
@@ -582,8 +588,8 @@
   // We assume that the warp size is a power of 2.
   llvm::Value *Mask = Bld.CreateNUWSub(RT.getGPUWarpSize(CGF), Bld.getInt32(1));

-  return Bld.CreateAnd(Bld.CreateNUWSub(NumThreads, Bld.getInt32(1)),
-                       Bld.CreateNot(Mask), "master_tid");
+  llvm::Value *NumThreadsSubOne = Bld.CreateNUWSub(NumThreads, Bld.getInt32(1));
+  return Bld.CreateAnd(NumThreadsSubOne, Bld.CreateNot(Mask), "master_tid");
 }

 CGOpenMPRuntimeGPU::WorkerFunctionState::WorkerFunctionState(
@@ -1131,8 +1137,9 @@
   EST.ExitBB = CGF.createBasicBlock(".exit");

   auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
-  llvm::Value *IsWorker =
-      Bld.CreateICmpULT(RT.getGPUThreadID(CGF), getThreadLimit(CGF));
+  llvm::Value *GPUThreadID = RT.getGPUThreadID(CGF);
+  llvm::Value *ThreadLimit = getThreadLimit(CGF);
+  llvm::Value *IsWorker = Bld.CreateICmpULT(GPUThreadID, ThreadLimit);
   Bld.CreateCondBr(IsWorker, WorkerBB, MasterCheckBB);

   CGF.EmitBlock(WorkerBB);
@@ -1140,8 +1147,9 @@
   CGF.EmitBranch(EST.ExitBB);

   CGF.EmitBlock(MasterCheckBB);
-  llvm::Value *IsMaster =
-      Bld.CreateICmpEQ(RT.getGPUThreadID(CGF), getMasterThreadID(CGF));
+  GPUThreadID = RT.getGPUThreadID(CGF);
+  llvm::Value *MasterThreadID = getMasterThreadID(CGF);
+  llvm::Value *IsMaster = Bld.CreateICmpEQ(GPUThreadID, MasterThreadID);
   Bld.CreateCondBr(IsMaster, MasterBB, EST.ExitBB);

   CGF.EmitBlock(MasterBB);
@@ -2073,105 +2081,41 @@
   emitOutlinedFunctionCall(CGF, Loc, OutlinedFn, OutlinedFnArgs);
 }

-void CGOpenMPRuntimeGPU::emitParallelCall(
-    CodeGenFunction &CGF, SourceLocation Loc, llvm::Function *OutlinedFn,
-    ArrayRef<llvm::Value *> CapturedVars, const Expr *IfCond) {
+void CGOpenMPRuntimeGPU::emitParallelCall(CodeGenFunction &CGF,
+                                          SourceLocation Loc,
+                                          llvm::Function *OutlinedFn,
+                                          ArrayRef<llvm::Value *> CapturedVars,
+                                          const Expr *IfCond) {
   if (!CGF.HaveInsertPoint())
     return;

-  if (getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD)
-    emitSPMDParallelCall(CGF, Loc, OutlinedFn, CapturedVars, IfCond);
-  else
-    emitNonSPMDParallelCall(CGF, Loc, OutlinedFn, CapturedVars, IfCond);
-}
-
-void CGOpenMPRuntimeGPU::emitNonSPMDParallelCall(
-    CodeGenFunction &CGF, SourceLocation Loc, llvm::Value *OutlinedFn,
-    ArrayRef<llvm::Value *> CapturedVars, const Expr *IfCond) {
-  llvm::Function *Fn = cast<llvm::Function>(OutlinedFn);
-
-  // Force inline this outlined function at its call site.
-  Fn->setLinkage(llvm::GlobalValue::InternalLinkage);
-
-  Address ZeroAddr = CGF.CreateDefaultAlignTempAlloca(CGF.Int32Ty,
-                                                      /*Name=*/".zero.addr");
-  CGF.InitTempAlloca(ZeroAddr, CGF.Builder.getInt32(/*C*/ 0));
-  // ThreadId for serialized parallels is 0.
-  Address ThreadIDAddr = ZeroAddr;
-  auto &&CodeGen = [this, Fn, CapturedVars, Loc, &ThreadIDAddr](
-                       CodeGenFunction &CGF, PrePostActionTy &Action) {
-    Action.Enter(CGF);
-
-    Address ZeroAddr =
-        CGF.CreateDefaultAlignTempAlloca(CGF.Int32Ty,
-                                         /*Name=*/".bound.zero.addr");
-    CGF.InitTempAlloca(ZeroAddr, CGF.Builder.getInt32(/*C*/ 0));
-    llvm::SmallVector<llvm::Value *, 16> OutlinedFnArgs;
-    OutlinedFnArgs.push_back(ThreadIDAddr.getPointer());
-    OutlinedFnArgs.push_back(ZeroAddr.getPointer());
-    OutlinedFnArgs.append(CapturedVars.begin(), CapturedVars.end());
-    emitOutlinedFunctionCall(CGF, Loc, Fn, OutlinedFnArgs);
-  };
-  auto &&SeqGen = [this, &CodeGen, Loc](CodeGenFunction &CGF,
-                                        PrePostActionTy &) {
-
-    RegionCodeGenTy RCG(CodeGen);
-    llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc);
-    llvm::Value *ThreadID = getThreadID(CGF, Loc);
-    llvm::Value *Args[] = {RTLoc, ThreadID};
-
-    NVPTXActionTy Action(
-        OMPBuilder.getOrCreateRuntimeFunction(
-            CGM.getModule(), OMPRTL___kmpc_serialized_parallel),
-        Args,
-        OMPBuilder.getOrCreateRuntimeFunction(
-            CGM.getModule(), OMPRTL___kmpc_end_serialized_parallel),
-        Args);
-    RCG.setAction(Action);
-    RCG(CGF);
-  };
-
-  auto &&L0ParallelGen = [this, CapturedVars, Fn](CodeGenFunction &CGF,
-                                                  PrePostActionTy &Action) {
+  auto &&ParallelGen = [this, Loc, OutlinedFn, CapturedVars,
+                        IfCond](CodeGenFunction &CGF, PrePostActionTy &Action) {
     CGBuilderTy &Bld = CGF.Builder;
-    llvm::Function *WFn = WrapperFunctionsMap[Fn];
-    assert(WFn && "Wrapper function does not exist!");
-    llvm::Value *ID = Bld.CreateBitOrPointerCast(WFn, CGM.Int8PtrTy);
-
-    // Prepare for parallel region. Indicate the outlined function.
-    llvm::Value *Args[] = {ID};
-    CGF.EmitRuntimeCall(
-        OMPBuilder.getOrCreateRuntimeFunction(
-            CGM.getModule(), OMPRTL___kmpc_kernel_prepare_parallel),
-        Args);
+    llvm::Function *WFn = WrapperFunctionsMap[OutlinedFn];
+    llvm::Value *ID = llvm::ConstantPointerNull::get(CGM.Int8PtrTy);
+    if (WFn) {
+      ID = Bld.CreateBitOrPointerCast(WFn, CGM.Int8PtrTy);
+      // Remember for post-processing in worker loop.
+      Work.emplace_back(WFn);
+    }
+    llvm::Value *FnPtr = Bld.CreateBitOrPointerCast(OutlinedFn, CGM.Int8PtrTy);

     // Create a private scope that will globalize the arguments
     // passed from the outside of the target region.
+    // TODO: Is that needed?
     CodeGenFunction::OMPPrivateScope PrivateArgScope(CGF);

+    Address CapturedVarsAddrs = CGF.CreateDefaultAlignTempAlloca(
+        llvm::ArrayType::get(CGM.VoidPtrTy, CapturedVars.size()),
+        "captured_vars_addrs");
     // There's something to share.
     if (!CapturedVars.empty()) {
       // Prepare for parallel region. Indicate the outlined function.
-      Address SharedArgs =
-          CGF.CreateDefaultAlignTempAlloca(CGF.VoidPtrPtrTy, "shared_arg_refs");
-      llvm::Value *SharedArgsPtr = SharedArgs.getPointer();
-
-      llvm::Value *DataSharingArgs[] = {
-          SharedArgsPtr,
-          llvm::ConstantInt::get(CGM.SizeTy, CapturedVars.size())};
-      CGF.EmitRuntimeCall(
-          OMPBuilder.getOrCreateRuntimeFunction(
-              CGM.getModule(), OMPRTL___kmpc_begin_sharing_variables),
-          DataSharingArgs);
-
-      // Store variable address in a list of references to pass to workers.
-      unsigned Idx = 0;
       ASTContext &Ctx = CGF.getContext();
-      Address SharedArgListAddress = CGF.EmitLoadOfPointer(
-          SharedArgs, Ctx.getPointerType(Ctx.getPointerType(Ctx.VoidPtrTy))
-                          .castAs<PointerType>());
+      unsigned Idx = 0;
       for (llvm::Value *V : CapturedVars) {
-        Address Dst = Bld.CreateConstInBoundsGEP(SharedArgListAddress, Idx);
+        Address Dst = Bld.CreateConstArrayGEP(CapturedVarsAddrs, Idx);
         llvm::Value *PtrV;
         if (V->getType()->isIntegerTy())
           PtrV = Bld.CreateIntToPtr(V, CGF.VoidPtrTy);
@@ -2183,141 +2127,33 @@
       }
     }

-    // Activate workers. This barrier is used by the master to signal
-    // work for the workers.
-    syncCTAThreads(CGF);
-
-    // OpenMP [2.5, Parallel Construct, p.49]
-    // There is an implied barrier at the end of a parallel region. After the
-    // end of a parallel region, only the master thread of the team resumes
-    // execution of the enclosing task region.
-    //
-    // The master waits at this barrier until all workers are done.
-    syncCTAThreads(CGF);
-
-    if (!CapturedVars.empty())
-      CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
-          CGM.getModule(), OMPRTL___kmpc_end_sharing_variables));
-
-    // Remember for post-processing in worker loop.
-    Work.emplace_back(WFn);
-  };
-
-  auto &&LNParallelGen = [this, Loc, &SeqGen, &L0ParallelGen](
-                             CodeGenFunction &CGF, PrePostActionTy &Action) {
-    if (IsInParallelRegion) {
-      SeqGen(CGF, Action);
-    } else if (IsInTargetMasterThreadRegion) {
-      L0ParallelGen(CGF, Action);
-    } else {
-      // Check for master and then parallelism:
-      // if (__kmpc_is_spmd_exec_mode() || __kmpc_parallel_level(loc, gtid)) {
-      //   Serialized execution.
-      // } else {
-      //   Worker call.
-      // }
-      CGBuilderTy &Bld = CGF.Builder;
-      llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".exit");
-      llvm::BasicBlock *SeqBB = CGF.createBasicBlock(".sequential");
-      llvm::BasicBlock *ParallelCheckBB = CGF.createBasicBlock(".parcheck");
-      llvm::BasicBlock *MasterBB = CGF.createBasicBlock(".master");
-      llvm::Value *IsSPMD = Bld.CreateIsNotNull(
-          CGF.EmitNounwindRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
-              CGM.getModule(), OMPRTL___kmpc_is_spmd_exec_mode)));
-      Bld.CreateCondBr(IsSPMD, SeqBB, ParallelCheckBB);
-      // There is no need to emit line number for unconditional branch.
-      (void)ApplyDebugLocation::CreateEmpty(CGF);
-      CGF.EmitBlock(ParallelCheckBB);
-      llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc);
-      llvm::Value *ThreadID = getThreadID(CGF, Loc);
-      llvm::Value *PL = CGF.EmitRuntimeCall(
-          OMPBuilder.getOrCreateRuntimeFunction(CGM.getModule(),
-                                                OMPRTL___kmpc_parallel_level),
-          {RTLoc, ThreadID});
-      llvm::Value *Res = Bld.CreateIsNotNull(PL);
-      Bld.CreateCondBr(Res, SeqBB, MasterBB);
-      CGF.EmitBlock(SeqBB);
-      SeqGen(CGF, Action);
-      CGF.EmitBranch(ExitBB);
-      // There is no need to emit line number for unconditional branch.
-      (void)ApplyDebugLocation::CreateEmpty(CGF);
-      CGF.EmitBlock(MasterBB);
-      L0ParallelGen(CGF, Action);
-      CGF.EmitBranch(ExitBB);
-      // There is no need to emit line number for unconditional branch.
-      (void)ApplyDebugLocation::CreateEmpty(CGF);
-      // Emit the continuation block for code after the if.
-      CGF.EmitBlock(ExitBB, /*IsFinished=*/true);
-    }
-  };
-
-  if (IfCond) {
-    emitIfClause(CGF, IfCond, LNParallelGen, SeqGen);
-  } else {
-    CodeGenFunction::RunCleanupsScope Scope(CGF);
-    RegionCodeGenTy ThenRCG(LNParallelGen);
-    ThenRCG(CGF);
-  }
-}
-
-void CGOpenMPRuntimeGPU::emitSPMDParallelCall(
-    CodeGenFunction &CGF, SourceLocation Loc, llvm::Function *OutlinedFn,
-    ArrayRef<llvm::Value *> CapturedVars, const Expr *IfCond) {
-  // Just call the outlined function to execute the parallel region.
-  // OutlinedFn(&GTid, &zero, CapturedStruct);
-  //
-  llvm::SmallVector<llvm::Value *, 16> OutlinedFnArgs;
-
-  Address ZeroAddr = CGF.CreateDefaultAlignTempAlloca(CGF.Int32Ty,
-                                                      /*Name=*/".zero.addr");
-  CGF.InitTempAlloca(ZeroAddr, CGF.Builder.getInt32(/*C*/ 0));
-  // ThreadId for serialized parallels is 0.
-  Address ThreadIDAddr = ZeroAddr;
-  auto &&CodeGen = [this, OutlinedFn, CapturedVars, Loc, &ThreadIDAddr](
-                       CodeGenFunction &CGF, PrePostActionTy &Action) {
-    Action.Enter(CGF);
-
-    Address ZeroAddr =
-        CGF.CreateDefaultAlignTempAlloca(CGF.Int32Ty,
-                                         /*Name=*/".bound.zero.addr");
-    CGF.InitTempAlloca(ZeroAddr, CGF.Builder.getInt32(/*C*/ 0));
-    llvm::SmallVector<llvm::Value *, 16> OutlinedFnArgs;
-    OutlinedFnArgs.push_back(ThreadIDAddr.getPointer());
-    OutlinedFnArgs.push_back(ZeroAddr.getPointer());
-    OutlinedFnArgs.append(CapturedVars.begin(), CapturedVars.end());
-    emitOutlinedFunctionCall(CGF, Loc, OutlinedFn, OutlinedFnArgs);
-  };
-  auto &&SeqGen = [this, &CodeGen, Loc](CodeGenFunction &CGF,
-                                        PrePostActionTy &) {
+    llvm::Value *IfCondVal = nullptr;
+    if (IfCond)
+      IfCondVal = Bld.CreateIntCast(CGF.EvaluateExprAsBool(IfCond), CGF.Int32Ty,
+                                    /* isSigned */ false);
+    else
+      IfCondVal = llvm::ConstantInt::get(CGF.Int32Ty, 1);

-    RegionCodeGenTy RCG(CodeGen);
+    assert(IfCondVal && "Expected a value");
     llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc);
-    llvm::Value *ThreadID = getThreadID(CGF, Loc);
-    llvm::Value *Args[] = {RTLoc, ThreadID};
-
-    NVPTXActionTy Action(
-        OMPBuilder.getOrCreateRuntimeFunction(
-            CGM.getModule(), OMPRTL___kmpc_serialized_parallel),
-        Args,
-        OMPBuilder.getOrCreateRuntimeFunction(
-            CGM.getModule(), OMPRTL___kmpc_end_serialized_parallel),
-        Args);
-    RCG.setAction(Action);
-    RCG(CGF);
+    llvm::Value *Args[] = {
+        RTLoc,
+        getThreadID(CGF, Loc),
+        IfCondVal,
+        llvm::ConstantInt::get(CGF.Int32Ty, -1),
+        llvm::ConstantInt::get(CGF.Int32Ty, -1),
+        FnPtr,
+        ID,
+        Bld.CreateBitOrPointerCast(CapturedVarsAddrs.getPointer(),
+                                   CGF.VoidPtrPtrTy),
+        llvm::ConstantInt::get(CGM.SizeTy, CapturedVars.size())};
+    CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
+                            CGM.getModule(), OMPRTL___kmpc_parallel_51),
+                        Args);
   };

-  if (IsInTargetMasterThreadRegion) {
-    // In the worker need to use the real thread id.
-    ThreadIDAddr = emitThreadIDAddress(CGF, Loc);
-    RegionCodeGenTy RCG(CodeGen);
-    RCG(CGF);
-  } else {
-    // If we are not in the target region, it is definitely L2 parallelism or
-    // more, because for SPMD mode we always has L1 parallel level, sowe don't
-    // need to check for orphaned directives.
-    RegionCodeGenTy RCG(SeqGen);
-    RCG(CGF);
-  }
+  RegionCodeGenTy RCG(ParallelGen);
+  RCG(CGF);
 }

 void CGOpenMPRuntimeGPU::syncCTAThreads(CodeGenFunction &CGF) {
diff --git a/clang/test/OpenMP/nvptx_allocate_codegen.cpp b/clang/test/OpenMP/nvptx_allocate_codegen.cpp
--- a/clang/test/OpenMP/nvptx_allocate_codegen.cpp
+++ b/clang/test/OpenMP/nvptx_allocate_codegen.cpp
@@ -1,3 +1,4 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature --include-generated-funcs --replace-function-regex "__omp_offloading_[0-9a-z]+_[0-9a-z]+"
 // RUN: %clang_cc1 -verify -fopenmp -triple x86_64-apple-darwin10.6.0 -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc -o %t-host.bc %s
 // RUN: %clang_cc1 -verify -fopenmp -triple nvptx64-nvidia-cuda -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-host.bc -o - -disable-llvm-optzns | FileCheck %s
 // expected-no-diagnostics
@@ -17,16 +18,6 @@
 extern const omp_allocator_handle_t omp_pteam_mem_alloc;
 extern const omp_allocator_handle_t omp_thread_mem_alloc;

-// CHECK-DAG: @{{.+}}St1{{.+}}b{{.+}} = external global i32,
-// CHECK-DAG: @a ={{ hidden | }}global i32 0,
-// CHECK-DAG: @b ={{ hidden | }}addrspace(4) global i32 0,
-// CHECK-DAG: @c ={{ hidden | }}global i32 0,
-// CHECK-DAG: @d ={{ hidden | }}global %struct.St1 zeroinitializer,
-// CHECK-DAG: @{{.+}}ns{{.+}}a{{.+}} ={{ hidden | }}addrspace(3) global i32 0,
-// CHECK-DAG: @{{.+}}main{{.+}}a{{.*}} = internal global i32 0,
-// CHECK-DAG: @{{.+}}ST{{.+}}m{{.+}} = external global i32,
-// CHECK-DAG: @bar_c = internal global i32 0,
-// CHECK-DAG: @bar_b = internal addrspace(3) global double 0.000000e+00,
 struct St{
  int a;
 };
@@ -60,9 +51,7 @@
 }
 #pragma omp allocate(ns::a) allocator(omp_pteam_mem_alloc)

-// CHECK-LABEL: @main
 int main () {
-  // CHECK: alloca double,
   static int a;
 #pragma omp allocate(a) allocator(omp_thread_mem_alloc)
   a=2;
@@ -73,22 +62,16 @@
   return (foo<int>());
 }

-// CHECK: define {{.*}}i32 @{{.+}}foo{{.+}}()
-// CHECK-NOT: alloca i32,
 extern template int ST<int>::m;

 void baz(float &);

-// CHECK: define{{ hidden | }}void @{{.+}}bar{{.+}}()
 void bar() {
-  // CHECK: alloca float,
   float bar_a;
-  // CHECK: alloca double,
   double bar_b;
   int bar_c;
 #pragma omp allocate(bar_c) allocator(omp_cgroup_mem_alloc)
-  // CHECK: call void [[OUTLINED:@.+]](i32* %{{.+}}, i32* %{{.+}})
 #pragma omp parallel private(bar_a, bar_b) allocate(omp_thread_mem_alloc \
                                                     : bar_a) allocate(omp_pteam_mem_alloc \
                                                                       : bar_b)
@@ -96,13 +79,69 @@
   bar_b = bar_a;
   baz(bar_a);
 }
-// CHECK: define internal void [[OUTLINED]](i32* noalias %{{.+}}, i32* noalias %{{.+}})
-// CHECK-NOT: alloca double,
-// CHECK: alloca float,
-// CHECK-NOT: alloca double,
-// CHECK: load float, float* %
-// CHECK: store double {{.+}}, double* addrspacecast (double addrspace(3)* @bar_b to double*),
 }

 #pragma omp end declare target
 #endif
+// CHECK-LABEL: define {{[^@]+}}@main
+// CHECK-SAME: () #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[B:%.*]] = alloca double, align 8
+// CHECK-NEXT:    store i32 0, i32* [[RETVAL]], align 4
+// CHECK-NEXT:    store i32 2, i32* @_ZZ4mainE1a, align 4
+// CHECK-NEXT:    store double 3.000000e+00, double* [[B]], align 8
+// CHECK-NEXT:    [[CALL:%.*]] = call i32 @_Z3fooIiET_v() #[[ATTR6:[0-9]+]]
+// CHECK-NEXT:    ret i32 [[CALL]]
+//
+//
+// CHECK-LABEL: define {{[^@]+}}@_Z3fooIiET_v
+// CHECK-SAME: () #[[ATTR1:[0-9]+]] comdat {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = load 
i32, i32* @_ZN2STIiE1mE, align 4 +// CHECK-NEXT: store i32 [[TMP0]], i32* @v, align 4 +// CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* @v, align 4 +// CHECK-NEXT: ret i32 [[TMP1]] +// +// +// CHECK-LABEL: define {{[^@]+}}@_Z3barv +// CHECK-SAME: () #[[ATTR1]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[BAR_A:%.*]] = alloca float, align 4 +// CHECK-NEXT: [[BAR_B:%.*]] = alloca double, align 8 +// CHECK-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 8 +// CHECK-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]]) +// CHECK-NEXT: [[TMP1:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__ to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined___wrapper to i8*), i8** [[TMP1]], i64 0) +// CHECK-NEXT: ret void +// +// +// CHECK-LABEL: define {{[^@]+}}@__omp_outlined__ +// CHECK-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR2:[0-9]+]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK-NEXT: [[BAR_A:%.*]] = alloca float, align 4 +// CHECK-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +// CHECK-NEXT: [[TMP0:%.*]] = load float, float* [[BAR_A]], align 4 +// CHECK-NEXT: [[CONV:%.*]] = fpext float [[TMP0]] to double +// CHECK-NEXT: store double [[CONV]], double* addrspacecast (double addrspace(3)* @bar_b to double*), align 8 +// CHECK-NEXT: call void @_Z3bazRf(float* nonnull align 4 dereferenceable(4) [[BAR_A]]) #[[ATTR6]] +// CHECK-NEXT: ret void +// +// +// CHECK-LABEL: define {{[^@]+}}@__omp_outlined___wrapper +// CHECK-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR4:[0-9]+]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 +// CHECK-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 +// CHECK-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 +// CHECK-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) +// CHECK-NEXT: call void @__omp_outlined__(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR5:[0-9]+]] +// CHECK-NEXT: ret void +// diff --git a/clang/test/OpenMP/nvptx_data_sharing.cpp b/clang/test/OpenMP/nvptx_data_sharing.cpp --- a/clang/test/OpenMP/nvptx_data_sharing.cpp +++ b/clang/test/OpenMP/nvptx_data_sharing.cpp @@ -1,9 +1,10 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature --include-generated-funcs --replace-function-regex "__omp_offloading_[0-9a-z]+_[0-9a-z]+" // Test device global memory data sharing codegen. 
///==========================================================================/// // RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc -// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix CK1 --check-prefix SEQ -// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -disable-llvm-optzns -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix CK1 --check-prefix PAR +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix CHECK1 +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -disable-llvm-optzns -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix CHECK2 // expected-no-diagnostics @@ -27,93 +28,386 @@ } } } -// SEQ: [[MEM_TY:%.+]] = type { [128 x i8] } -// SEQ-DAG: [[SHARED_GLOBAL_RD:@.+]] = weak addrspace(3) global [[MEM_TY]] undef -// SEQ-DAG: [[KERNEL_PTR:@.+]] = internal addrspace(3) global i8* undef -// SEQ-DAG: [[KERNEL_SIZE:@.+]] = internal unnamed_addr constant i64 8 -// SEQ-DAG: [[KERNEL_SHARED:@.+]] = internal unnamed_addr constant i16 1 - -/// ========= In the worker function ========= /// -// CK1: {{.*}}define internal void @__omp_offloading{{.*}}test_ds{{.*}}_worker() -// CK1: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) -// CK1-NOT: call void @__kmpc_data_sharing_init_stack - -/// ========= In the kernel function ========= /// - -// CK1: {{.*}}define weak void @__omp_offloading{{.*}}test_ds{{.*}}() -// CK1: [[SHAREDARGS1:%.+]] = alloca i8** -// CK1: [[SHAREDARGS2:%.+]] = alloca i8** -// CK1: call void @__kmpc_kernel_init -// CK1: call void @__kmpc_data_sharing_init_stack -// SEQ: [[SHARED_MEM_FLAG:%.+]] = load i16, i16* [[KERNEL_SHARED]], -// SEQ: [[SIZE:%.+]] = load i64, i64* [[KERNEL_SIZE]], -// SEQ: call void @__kmpc_get_team_static_memory(i16 0, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds ([[MEM_TY]], [[MEM_TY]] addrspace(3)* [[SHARED_GLOBAL_RD]], i32 0, i32 0, i32 0) to i8*), i64 [[SIZE]], i16 [[SHARED_MEM_FLAG]], i8** addrspacecast (i8* addrspace(3)* [[KERNEL_PTR]] to i8**)) -// SEQ: [[KERNEL_RD:%.+]] = load i8*, i8* addrspace(3)* [[KERNEL_PTR]], -// SEQ: [[GLOBALSTACK:%.+]] = getelementptr inbounds i8, i8* [[KERNEL_RD]], i64 0 -// PAR: [[GLOBALSTACK:%.+]] = call i8* @__kmpc_data_sharing_push_stack(i{{32|64}} 8, i16 1) -// CK1: [[GLOBALSTACK2:%.+]] = bitcast i8* [[GLOBALSTACK]] to %struct._globalized_locals_ty* -// CK1: [[A:%.+]] = getelementptr inbounds %struct._globalized_locals_ty, %struct._globalized_locals_ty* [[GLOBALSTACK2]], i32 0, i32 0 -// CK1: [[B:%.+]] = getelementptr inbounds %struct._globalized_locals_ty, %struct._globalized_locals_ty* [[GLOBALSTACK2]], i32 0, i32 1 -// CK1: store i32 10, i32* [[A]] -// CK1: call void @__kmpc_kernel_prepare_parallel({{.*}}) -// CK1: call void @__kmpc_begin_sharing_variables(i8*** [[SHAREDARGS1]], i64 1) -// CK1: 
[[SHARGSTMP1:%.+]] = load i8**, i8*** [[SHAREDARGS1]] -// CK1: [[SHARGSTMP2:%.+]] = getelementptr inbounds i8*, i8** [[SHARGSTMP1]], i64 0 -// CK1: [[SHAREDVAR:%.+]] = bitcast i32* [[A]] to i8* -// CK1: store i8* [[SHAREDVAR]], i8** [[SHARGSTMP2]] -// CK1: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) -// CK1: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) -// CK1: call void @__kmpc_end_sharing_variables() -// CK1: store i32 100, i32* [[B]] -// CK1: call void @__kmpc_kernel_prepare_parallel({{.*}}) -// CK1: call void @__kmpc_begin_sharing_variables(i8*** [[SHAREDARGS2]], i64 2) -// CK1: [[SHARGSTMP3:%.+]] = load i8**, i8*** [[SHAREDARGS2]] -// CK1: [[SHARGSTMP4:%.+]] = getelementptr inbounds i8*, i8** [[SHARGSTMP3]], i64 0 -// CK1: [[SHAREDVAR1:%.+]] = bitcast i32* [[B]] to i8* -// CK1: store i8* [[SHAREDVAR1]], i8** [[SHARGSTMP4]] -// CK1: [[SHARGSTMP12:%.+]] = getelementptr inbounds i8*, i8** [[SHARGSTMP3]], i64 1 -// CK1: [[SHAREDVAR2:%.+]] = bitcast i32* [[A]] to i8* -// CK1: store i8* [[SHAREDVAR2]], i8** [[SHARGSTMP12]] -// CK1: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) -// CK1: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) -// CK1: call void @__kmpc_end_sharing_variables() -// SEQ: [[SHARED_MEM_FLAG:%.+]] = load i16, i16* [[KERNEL_SHARED]], -// SEQ: call void @__kmpc_restore_team_static_memory(i16 0, i16 [[SHARED_MEM_FLAG]]) -// PAR: call void @__kmpc_data_sharing_pop_stack(i8* [[GLOBALSTACK]]) -// CK1: call void @__kmpc_kernel_deinit(i16 1) - -/// ========= In the data sharing wrapper function ========= /// - -// CK1: {{.*}}define internal void @__omp_outlined{{.*}}wrapper({{.*}}) -// CK1: [[SHAREDARGS4:%.+]] = alloca i8** -// CK1: call void @__kmpc_get_shared_variables(i8*** [[SHAREDARGS4]]) -// CK1: [[SHARGSTMP13:%.+]] = load i8**, i8*** [[SHAREDARGS4]] -// CK1: [[SHARGSTMP14:%.+]] = getelementptr inbounds i8*, i8** [[SHARGSTMP13]], i64 0 -// CK1: [[SHARGSTMP15:%.+]] = bitcast i8** [[SHARGSTMP14]] to i32** -// CK1: [[SHARGSTMP16:%.+]] = load i32*, i32** [[SHARGSTMP15]] -// CK1: call void @__omp_outlined__{{.*}}({{.*}}, i32* [[SHARGSTMP16]]) - -/// outlined function for the second parallel region /// - -// CK1: define internal void @{{.+}}(i32* noalias %{{.+}}, i32* noalias %{{.+}}, i32* nonnull align {{[0-9]+}} dereferenceable{{.+}}, i32* nonnull align {{[0-9]+}} dereferenceable{{.+}}) -// CK1-NOT: call i8* @__kmpc_data_sharing_push_stack( -// CK1: [[C_ADDR:%.+]] = alloca i32, -// CK1: store i32* [[C_ADDR]], i32** % -// CK1i-NOT: call void @__kmpc_data_sharing_pop_stack( - -/// ========= In the data sharing wrapper function ========= /// - -// CK1: {{.*}}define internal void @__omp_outlined{{.*}}wrapper({{.*}}) -// CK1: [[SHAREDARGS3:%.+]] = alloca i8** -// CK1: call void @__kmpc_get_shared_variables(i8*** [[SHAREDARGS3]]) -// CK1: [[SHARGSTMP5:%.+]] = load i8**, i8*** [[SHAREDARGS3]] -// CK1: [[SHARGSTMP6:%.+]] = getelementptr inbounds i8*, i8** [[SHARGSTMP5]], i64 0 -// CK1: [[SHARGSTMP7:%.+]] = bitcast i8** [[SHARGSTMP6]] to i32** -// CK1: [[SHARGSTMP8:%.+]] = load i32*, i32** [[SHARGSTMP7]] -// CK1: [[SHARGSTMP9:%.+]] = getelementptr inbounds i8*, i8** [[SHARGSTMP5]], i64 1 -// CK1: [[SHARGSTMP10:%.+]] = bitcast i8** [[SHARGSTMP9]] to i32** -// CK1: [[SHARGSTMP11:%.+]] = load i32*, i32** [[SHARGSTMP10]] -// CK1: call void @__omp_outlined__{{.*}}({{.*}}, i32* [[SHARGSTMP8]], i32* [[SHARGSTMP11]]) #endif +// CHECK1-LABEL: define 
{{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z7test_dsv_l15_worker +// CHECK1-SAME: () #[[ATTR0:[0-9]+]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 8 +// CHECK1-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1 +// CHECK1-NEXT: store i8* null, i8** [[WORK_FN]], align 8 +// CHECK1-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1 +// CHECK1-NEXT: br label [[DOTAWAIT_WORK:%.*]] +// CHECK1: .await.work: +// CHECK1-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK1-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]]) +// CHECK1-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8 +// CHECK1-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1 +// CHECK1-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 8 +// CHECK1-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null +// CHECK1-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]] +// CHECK1: .select.workers: +// CHECK1-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1 +// CHECK1-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0 +// CHECK1-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]] +// CHECK1: .execute.parallel: +// CHECK1-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]]) +// CHECK1-NEXT: [[TMP5:%.*]] = load i8*, i8** [[WORK_FN]], align 8 +// CHECK1-NEXT: [[WORK_MATCH:%.*]] = icmp eq i8* [[TMP5]], bitcast (void (i16, i32)* @__omp_outlined___wrapper to i8*) +// CHECK1-NEXT: br i1 [[WORK_MATCH]], label [[DOTEXECUTE_FN:%.*]], label [[DOTCHECK_NEXT:%.*]] +// CHECK1: .execute.fn: +// CHECK1-NEXT: call void @__omp_outlined___wrapper(i16 0, i32 [[TMP4]]) #[[ATTR3:[0-9]+]] +// CHECK1-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]] +// CHECK1: .check.next: +// CHECK1-NEXT: [[TMP6:%.*]] = load i8*, i8** [[WORK_FN]], align 8 +// CHECK1-NEXT: [[WORK_MATCH1:%.*]] = icmp eq i8* [[TMP6]], bitcast (void (i16, i32)* @__omp_outlined__1_wrapper to i8*) +// CHECK1-NEXT: br i1 [[WORK_MATCH1]], label [[DOTEXECUTE_FN2:%.*]], label [[DOTCHECK_NEXT3:%.*]] +// CHECK1: .execute.fn2: +// CHECK1-NEXT: call void @__omp_outlined__1_wrapper(i16 0, i32 [[TMP4]]) #[[ATTR3]] +// CHECK1-NEXT: br label [[DOTTERMINATE_PARALLEL]] +// CHECK1: .check.next3: +// CHECK1-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)* +// CHECK1-NEXT: call void [[TMP7]](i16 0, i32 [[TMP4]]) +// CHECK1-NEXT: br label [[DOTTERMINATE_PARALLEL]] +// CHECK1: .terminate.parallel: +// CHECK1-NEXT: call void @__kmpc_kernel_end_parallel() +// CHECK1-NEXT: br label [[DOTBARRIER_PARALLEL]] +// CHECK1: .barrier.parallel: +// CHECK1-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK1-NEXT: br label [[DOTAWAIT_WORK]] +// CHECK1: .exit: +// CHECK1-NEXT: ret void +// +// +// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z7test_dsv_l15 +// CHECK1-SAME: () #[[ATTR1:[0-9]+]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [1 x i8*], align 8 +// CHECK1-NEXT: [[C:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[CAPTURED_VARS_ADDRS7:%.*]] = alloca [2 x i8*], align 8 +// CHECK1-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK1-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK1-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK1-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], 
[[NVPTX_WARP_SIZE]] +// CHECK1-NEXT: [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]] +// CHECK1-NEXT: br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]] +// CHECK1: .worker: +// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z7test_dsv_l15_worker() #[[ATTR3]] +// CHECK1-NEXT: br label [[DOTEXIT:%.*]] +// CHECK1: .mastercheck: +// CHECK1-NEXT: [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK1-NEXT: [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK1-NEXT: [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK1-NEXT: [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1 +// CHECK1-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1 +// CHECK1-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], -1 +// CHECK1-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP2]], [[TMP3]] +// CHECK1-NEXT: [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]] +// CHECK1-NEXT: br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]] +// CHECK1: .master: +// CHECK1-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK1-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK1-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]] +// CHECK1-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1) +// CHECK1-NEXT: call void @__kmpc_data_sharing_init_stack() +// CHECK1-NEXT: [[TMP5:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared", align 2 +// CHECK1-NEXT: [[TMP6:%.*]] = load i64, i64* @"_openmp_static_kernel$size", align 8 +// CHECK1-NEXT: call void @__kmpc_get_team_static_memory(i16 0, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds (%"union._shared_openmp_static_memory_type_$_", %"union._shared_openmp_static_memory_type_$_" addrspace(3)* @"_openmp_shared_static_glob_rd_$_", i32 0, i32 0, i32 0) to i8*), i64 [[TMP6]], i16 [[TMP5]], i8** addrspacecast (i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr" to i8**)) +// CHECK1-NEXT: [[TMP7:%.*]] = load i8*, i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr", align 8 +// CHECK1-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, i8* [[TMP7]], i64 0 +// CHECK1-NEXT: [[TMP9:%.*]] = bitcast i8* [[TMP8]] to %struct._globalized_locals_ty* +// CHECK1-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP9]], i32 0, i32 0 +// CHECK1-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY]], %struct._globalized_locals_ty* [[TMP9]], i32 0, i32 1 +// CHECK1-NEXT: [[TMP10:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) +// CHECK1-NEXT: store i32 10, i32* [[A]], align 4 +// CHECK1-NEXT: [[TMP11:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0 +// CHECK1-NEXT: [[TMP12:%.*]] = bitcast i32* [[A]] to i8* +// CHECK1-NEXT: store i8* [[TMP12]], i8** [[TMP11]], align 8 +// CHECK1-NEXT: [[TMP13:%.*]] = bitcast [1 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK1-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP10]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32*)* @__omp_outlined__ to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined___wrapper to i8*), i8** [[TMP13]], i64 1) +// CHECK1-NEXT: store i32 100, i32* [[B]], align 4 +// CHECK1-NEXT: store i32 1000, i32* [[C]], align 4 +// CHECK1-NEXT: [[TMP14:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* 
[[CAPTURED_VARS_ADDRS7]], i64 0, i64 0 +// CHECK1-NEXT: [[TMP15:%.*]] = bitcast i32* [[B]] to i8* +// CHECK1-NEXT: store i8* [[TMP15]], i8** [[TMP14]], align 8 +// CHECK1-NEXT: [[TMP16:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS7]], i64 0, i64 1 +// CHECK1-NEXT: [[TMP17:%.*]] = bitcast i32* [[A]] to i8* +// CHECK1-NEXT: store i8* [[TMP17]], i8** [[TMP16]], align 8 +// CHECK1-NEXT: [[TMP18:%.*]] = bitcast [2 x i8*]* [[CAPTURED_VARS_ADDRS7]] to i8** +// CHECK1-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP10]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32*, i32*)* @__omp_outlined__1 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__1_wrapper to i8*), i8** [[TMP18]], i64 2) +// CHECK1-NEXT: [[TMP19:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared", align 2 +// CHECK1-NEXT: call void @__kmpc_restore_team_static_memory(i16 0, i16 [[TMP19]]) +// CHECK1-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]] +// CHECK1: .termination.notifier: +// CHECK1-NEXT: call void @__kmpc_kernel_deinit(i16 1) +// CHECK1-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK1-NEXT: br label [[DOTEXIT]] +// CHECK1: .exit: +// CHECK1-NEXT: ret void +// +// +// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__ +// CHECK1-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR1]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK1-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 8 +// CHECK1-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK1-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +// CHECK1-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 8 +// CHECK1-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 8 +// CHECK1-NEXT: store i32 1000, i32* [[TMP0]], align 4 +// CHECK1-NEXT: ret void +// +// +// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined___wrapper +// CHECK1-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 +// CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 +// CHECK1-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK1-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 +// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK1-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) +// CHECK1-NEXT: [[TMP2:%.*]] = load i8**, i8*** [[GLOBAL_ARGS]], align 8 +// CHECK1-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8*, i8** [[TMP2]], i64 0 +// CHECK1-NEXT: [[TMP4:%.*]] = bitcast i8** [[TMP3]] to i32** +// CHECK1-NEXT: [[TMP5:%.*]] = load i32*, i32** [[TMP4]], align 8 +// CHECK1-NEXT: call void @__omp_outlined__(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]], i32* [[TMP5]]) #[[ATTR3]] +// CHECK1-NEXT: ret void +// +// +// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__1 +// CHECK1-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[B:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR1]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK1-NEXT: 
[[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK1-NEXT: [[B_ADDR:%.*]] = alloca i32*, align 8 +// CHECK1-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 8 +// CHECK1-NEXT: [[C:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[C1:%.*]] = alloca i32*, align 8 +// CHECK1-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK1-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +// CHECK1-NEXT: store i32* [[B]], i32** [[B_ADDR]], align 8 +// CHECK1-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 8 +// CHECK1-NEXT: [[TMP0:%.*]] = load i32*, i32** [[B_ADDR]], align 8 +// CHECK1-NEXT: [[TMP1:%.*]] = load i32*, i32** [[A_ADDR]], align 8 +// CHECK1-NEXT: store i32* [[C]], i32** [[C1]], align 8 +// CHECK1-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4 +// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP2]], 10000 +// CHECK1-NEXT: store i32 [[ADD]], i32* [[TMP0]], align 4 +// CHECK1-NEXT: ret void +// +// +// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__1_wrapper +// CHECK1-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 +// CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 +// CHECK1-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK1-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 +// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK1-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) +// CHECK1-NEXT: [[TMP2:%.*]] = load i8**, i8*** [[GLOBAL_ARGS]], align 8 +// CHECK1-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8*, i8** [[TMP2]], i64 0 +// CHECK1-NEXT: [[TMP4:%.*]] = bitcast i8** [[TMP3]] to i32** +// CHECK1-NEXT: [[TMP5:%.*]] = load i32*, i32** [[TMP4]], align 8 +// CHECK1-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8*, i8** [[TMP2]], i64 1 +// CHECK1-NEXT: [[TMP7:%.*]] = bitcast i8** [[TMP6]] to i32** +// CHECK1-NEXT: [[TMP8:%.*]] = load i32*, i32** [[TMP7]], align 8 +// CHECK1-NEXT: call void @__omp_outlined__1(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]], i32* [[TMP5]], i32* [[TMP8]]) #[[ATTR3]] +// CHECK1-NEXT: ret void +// +// +// CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z7test_dsv_l15_worker +// CHECK2-SAME: () #[[ATTR0:[0-9]+]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 8 +// CHECK2-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1 +// CHECK2-NEXT: store i8* null, i8** [[WORK_FN]], align 8 +// CHECK2-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1 +// CHECK2-NEXT: br label [[DOTAWAIT_WORK:%.*]] +// CHECK2: .await.work: +// CHECK2-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK2-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]]) +// CHECK2-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8 +// CHECK2-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1 +// CHECK2-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 8 +// CHECK2-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null +// CHECK2-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]] +// CHECK2: .select.workers: +// CHECK2-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1 +// CHECK2-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0 +// CHECK2-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]] +// CHECK2: 
.execute.parallel: +// CHECK2-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]]) +// CHECK2-NEXT: [[TMP5:%.*]] = load i8*, i8** [[WORK_FN]], align 8 +// CHECK2-NEXT: [[WORK_MATCH:%.*]] = icmp eq i8* [[TMP5]], bitcast (void (i16, i32)* @__omp_outlined___wrapper to i8*) +// CHECK2-NEXT: br i1 [[WORK_MATCH]], label [[DOTEXECUTE_FN:%.*]], label [[DOTCHECK_NEXT:%.*]] +// CHECK2: .execute.fn: +// CHECK2-NEXT: call void @__omp_outlined___wrapper(i16 0, i32 [[TMP4]]) #[[ATTR3:[0-9]+]] +// CHECK2-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]] +// CHECK2: .check.next: +// CHECK2-NEXT: [[TMP6:%.*]] = load i8*, i8** [[WORK_FN]], align 8 +// CHECK2-NEXT: [[WORK_MATCH1:%.*]] = icmp eq i8* [[TMP6]], bitcast (void (i16, i32)* @__omp_outlined__1_wrapper to i8*) +// CHECK2-NEXT: br i1 [[WORK_MATCH1]], label [[DOTEXECUTE_FN2:%.*]], label [[DOTCHECK_NEXT3:%.*]] +// CHECK2: .execute.fn2: +// CHECK2-NEXT: call void @__omp_outlined__1_wrapper(i16 0, i32 [[TMP4]]) #[[ATTR3]] +// CHECK2-NEXT: br label [[DOTTERMINATE_PARALLEL]] +// CHECK2: .check.next3: +// CHECK2-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)* +// CHECK2-NEXT: call void [[TMP7]](i16 0, i32 [[TMP4]]) +// CHECK2-NEXT: br label [[DOTTERMINATE_PARALLEL]] +// CHECK2: .terminate.parallel: +// CHECK2-NEXT: call void @__kmpc_kernel_end_parallel() +// CHECK2-NEXT: br label [[DOTBARRIER_PARALLEL]] +// CHECK2: .barrier.parallel: +// CHECK2-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK2-NEXT: br label [[DOTAWAIT_WORK]] +// CHECK2: .exit: +// CHECK2-NEXT: ret void +// +// +// CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z7test_dsv_l15 +// CHECK2-SAME: () #[[ATTR1:[0-9]+]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [1 x i8*], align 8 +// CHECK2-NEXT: [[C:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[CAPTURED_VARS_ADDRS7:%.*]] = alloca [2 x i8*], align 8 +// CHECK2-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK2-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK2-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK2-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]] +// CHECK2-NEXT: [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]] +// CHECK2-NEXT: br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]] +// CHECK2: .worker: +// CHECK2-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z7test_dsv_l15_worker() #[[ATTR3]] +// CHECK2-NEXT: br label [[DOTEXIT:%.*]] +// CHECK2: .mastercheck: +// CHECK2-NEXT: [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK2-NEXT: [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK2-NEXT: [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK2-NEXT: [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1 +// CHECK2-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1 +// CHECK2-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], -1 +// CHECK2-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP2]], [[TMP3]] +// CHECK2-NEXT: [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]] +// CHECK2-NEXT: br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]] +// CHECK2: .master: +// CHECK2-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK2-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// 
CHECK2-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]] +// CHECK2-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1) +// CHECK2-NEXT: call void @__kmpc_data_sharing_init_stack() +// CHECK2-NEXT: [[TMP5:%.*]] = call i8* @__kmpc_data_sharing_push_stack(i64 8, i16 1) +// CHECK2-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty* +// CHECK2-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP6]], i32 0, i32 0 +// CHECK2-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY]], %struct._globalized_locals_ty* [[TMP6]], i32 0, i32 1 +// CHECK2-NEXT: [[TMP7:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) +// CHECK2-NEXT: store i32 10, i32* [[A]], align 4 +// CHECK2-NEXT: [[TMP8:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0 +// CHECK2-NEXT: [[TMP9:%.*]] = bitcast i32* [[A]] to i8* +// CHECK2-NEXT: store i8* [[TMP9]], i8** [[TMP8]], align 8 +// CHECK2-NEXT: [[TMP10:%.*]] = bitcast [1 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK2-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP7]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32*)* @__omp_outlined__ to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined___wrapper to i8*), i8** [[TMP10]], i64 1) +// CHECK2-NEXT: store i32 100, i32* [[B]], align 4 +// CHECK2-NEXT: store i32 1000, i32* [[C]], align 4 +// CHECK2-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS7]], i64 0, i64 0 +// CHECK2-NEXT: [[TMP12:%.*]] = bitcast i32* [[B]] to i8* +// CHECK2-NEXT: store i8* [[TMP12]], i8** [[TMP11]], align 8 +// CHECK2-NEXT: [[TMP13:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS7]], i64 0, i64 1 +// CHECK2-NEXT: [[TMP14:%.*]] = bitcast i32* [[A]] to i8* +// CHECK2-NEXT: store i8* [[TMP14]], i8** [[TMP13]], align 8 +// CHECK2-NEXT: [[TMP15:%.*]] = bitcast [2 x i8*]* [[CAPTURED_VARS_ADDRS7]] to i8** +// CHECK2-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP7]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32*, i32*)* @__omp_outlined__1 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__1_wrapper to i8*), i8** [[TMP15]], i64 2) +// CHECK2-NEXT: call void @__kmpc_data_sharing_pop_stack(i8* [[TMP5]]) +// CHECK2-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]] +// CHECK2: .termination.notifier: +// CHECK2-NEXT: call void @__kmpc_kernel_deinit(i16 1) +// CHECK2-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK2-NEXT: br label [[DOTEXIT]] +// CHECK2: .exit: +// CHECK2-NEXT: ret void +// +// +// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__ +// CHECK2-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR1]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK2-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 8 +// CHECK2-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK2-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +// CHECK2-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 8 +// CHECK2-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 8 +// CHECK2-NEXT: store i32 1000, i32* [[TMP0]], align 4 +// CHECK2-NEXT: 
ret void +// +// +// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined___wrapper +// CHECK2-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 +// CHECK2-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 +// CHECK2-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK2-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 +// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK2-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) +// CHECK2-NEXT: [[TMP2:%.*]] = load i8**, i8*** [[GLOBAL_ARGS]], align 8 +// CHECK2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8*, i8** [[TMP2]], i64 0 +// CHECK2-NEXT: [[TMP4:%.*]] = bitcast i8** [[TMP3]] to i32** +// CHECK2-NEXT: [[TMP5:%.*]] = load i32*, i32** [[TMP4]], align 8 +// CHECK2-NEXT: call void @__omp_outlined__(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]], i32* [[TMP5]]) #[[ATTR3]] +// CHECK2-NEXT: ret void +// +// +// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__1 +// CHECK2-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[B:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR1]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK2-NEXT: [[B_ADDR:%.*]] = alloca i32*, align 8 +// CHECK2-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 8 +// CHECK2-NEXT: [[C:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[C1:%.*]] = alloca i32*, align 8 +// CHECK2-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK2-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +// CHECK2-NEXT: store i32* [[B]], i32** [[B_ADDR]], align 8 +// CHECK2-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 8 +// CHECK2-NEXT: [[TMP0:%.*]] = load i32*, i32** [[B_ADDR]], align 8 +// CHECK2-NEXT: [[TMP1:%.*]] = load i32*, i32** [[A_ADDR]], align 8 +// CHECK2-NEXT: store i32* [[C]], i32** [[C1]], align 8 +// CHECK2-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4 +// CHECK2-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP2]], 10000 +// CHECK2-NEXT: store i32 [[ADD]], i32* [[TMP0]], align 4 +// CHECK2-NEXT: ret void +// +// +// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__1_wrapper +// CHECK2-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 +// CHECK2-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 +// CHECK2-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK2-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 +// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK2-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) +// CHECK2-NEXT: [[TMP2:%.*]] = load i8**, i8*** [[GLOBAL_ARGS]], align 8 +// CHECK2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8*, i8** [[TMP2]], i64 0 +// CHECK2-NEXT: [[TMP4:%.*]] = bitcast i8** [[TMP3]] to i32** +// CHECK2-NEXT: [[TMP5:%.*]] = load i32*, i32** [[TMP4]], align 8 +// CHECK2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8*, i8** [[TMP2]], i64 1 +// CHECK2-NEXT: [[TMP7:%.*]] = bitcast i8** [[TMP6]] to i32** +// CHECK2-NEXT: 
[[TMP8:%.*]] = load i32*, i32** [[TMP7]], align 8 +// CHECK2-NEXT: call void @__omp_outlined__1(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]], i32* [[TMP5]], i32* [[TMP8]]) #[[ATTR3]] +// CHECK2-NEXT: ret void +// diff --git a/clang/test/OpenMP/nvptx_distribute_parallel_generic_mode_codegen.cpp b/clang/test/OpenMP/nvptx_distribute_parallel_generic_mode_codegen.cpp --- a/clang/test/OpenMP/nvptx_distribute_parallel_generic_mode_codegen.cpp +++ b/clang/test/OpenMP/nvptx_distribute_parallel_generic_mode_codegen.cpp @@ -1,21 +1,22 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature --include-generated-funcs --replace-function-regex "__omp_offloading_[0-9a-z]+_[0-9a-z]+" // Test target codegen - host bc file has to be created first. // RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc -// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64 --check-prefix SEQ -// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -disable-llvm-optzns -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64 --check-prefix PAR +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix CHECK1 +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -disable-llvm-optzns -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix CHECK2 // RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple i386-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm-bc %s -o %t-x86-host.bc -// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 --check-prefix SEQ -// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 --check-prefix SEQ -// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -disable-llvm-optzns -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 --check-prefix PAR -// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s 
-fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -disable-llvm-optzns -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 --check-prefix PAR +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix CHECK3 +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix CHECK4 +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -disable-llvm-optzns -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix CHECK5 +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -disable-llvm-optzns -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix CHECK6 // RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc -// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64 --check-prefix SEQ -// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -disable-llvm-optzns -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64 --check-prefix PAR +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix CHECK7 +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -disable-llvm-optzns -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix CHECK8 // RUN: %clang_cc1 -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm-bc %s -o %t-x86-host.bc -// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 --check-prefix SEQ -// RUN: %clang_cc1 -verify -fopenmp -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 --check-prefix SEQ -// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx-unknown-unknown 
-fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -disable-llvm-optzns -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 --check-prefix PAR -// RUN: %clang_cc1 -verify -fopenmp -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -disable-llvm-optzns -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 --check-prefix PAR +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix CHECK9 +// RUN: %clang_cc1 -verify -fopenmp -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix CHECK10 +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -disable-llvm-optzns -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix CHECK11 +// RUN: %clang_cc1 -verify -fopenmp -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -disable-llvm-optzns -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix CHECK12 // expected-no-diagnostics #ifndef HEADER @@ -34,36 +35,3989 @@ return 0; } -// SEQ: [[MEM_TY:%.+]] = type { [128 x i8] } -// SEQ-DAG: [[SHARED_GLOBAL_RD:@.+]] = weak addrspace(3) global [[MEM_TY]] undef -// SEQ-DAG: [[KERNEL_PTR:@.+]] = internal addrspace(3) global i8* undef -// SEQ-DAG: [[KERNEL_SIZE:@.+]] = internal unnamed_addr constant i{{64|32}} 40 -// SEQ-DAG: [[KERNEL_SHARED:@.+]] = internal unnamed_addr constant i16 1 -// CHECK-DAG: @__omp_offloading_{{.*}}_main_[[LINE:l.+]]_exec_mode = weak constant i8 0 - -// CHECK: define weak void @__omp_offloading_{{.*}}_main_[[LINE]]([10 x i32]* nonnull align 4 dereferenceable(40) %{{.+}}, [10 x i32]* nonnull align 4 dereferenceable(40) %{{.+}}, i32* nonnull align 4 dereferenceable(4) %{{.+}}, i{{64|32}} %{{.+}}, [10 x i32]* nonnull align 4 dereferenceable(40) %{{.+}}) -// SEQ: [[SHARED:%.+]] = load i16, i16* [[KERNEL_SHARED]], -// SEQ: [[SIZE:%.+]] = load i{{64|32}}, i{{64|32}}* [[KERNEL_SIZE]], -// SEQ: call void @__kmpc_get_team_static_memory(i16 1, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds ([[MEM_TY]], [[MEM_TY]] addrspace(3)* [[SHARED_GLOBAL_RD]], i32 0, i32 0, i32 0) to i8*), i{{64|32}} [[SIZE]], i16 [[SHARED]], i8** addrspacecast (i8* addrspace(3)* [[KERNEL_PTR]] to i8**)) -// SEQ: [[PTR:%.+]] = load i8*, i8* addrspace(3)* [[KERNEL_PTR]], -// SEQ: [[GEP:%.+]] = getelementptr inbounds i8, i8* [[PTR]], i{{64|32}} 0 -// PAR: [[GEP:%.+]] = call i8* @__kmpc_data_sharing_push_stack(i{{32|64}} 40, i16 1) -// CHECK: [[STACK:%.+]] = bitcast i8* [[GEP]] to %struct._globalized_locals_ty* -// CHECK: getelementptr inbounds %struct._globalized_locals_ty, %struct._globalized_locals_ty* [[STACK]], i{{32|64}} 0, i{{32|64}} 0 -// CHECK-NOT: getelementptr inbounds %struct._globalized_locals_ty, %struct._globalized_locals_ty* 
[[STACK]], -// CHECK: call void @__kmpc_for_static_init_4( - -// CHECK: call void [[PARALLEL:@.+]]( - -// CHECK: call void @__kmpc_for_static_fini(%struct.ident_t* @ - -// SEQ: [[SHARED:%.+]] = load i16, i16* [[KERNEL_SHARED]], -// SEQ: call void @__kmpc_restore_team_static_memory(i16 1, i16 [[SHARED]]) -// PAR: call void @__kmpc_data_sharing_pop_stack(i8* [[GEP]]) - -// CHECK: define internal void [[PARALLEL]]( -// CHECK-NOT: call i8* @__kmpc_data_sharing_push_stack( - -// CHECK-NOT: call void @__kmpc_data_sharing_pop_stack( - #endif +// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l31 +// CHECK1-SAME: ([10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[C:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i64 [[ARGC:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[D:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 8 +// CHECK1-NEXT: [[C_ADDR:%.*]] = alloca [10 x i32]*, align 8 +// CHECK1-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 8 +// CHECK1-NEXT: [[ARGC_ADDR:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[D_ADDR:%.*]] = alloca [10 x i32]*, align 8 +// CHECK1-NEXT: [[ARGC_CASTED:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK1-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 8 +// CHECK1-NEXT: store [10 x i32]* [[C]], [10 x i32]** [[C_ADDR]], align 8 +// CHECK1-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 8 +// CHECK1-NEXT: store i64 [[ARGC]], i64* [[ARGC_ADDR]], align 8 +// CHECK1-NEXT: store [10 x i32]* [[D]], [10 x i32]** [[D_ADDR]], align 8 +// CHECK1-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 8 +// CHECK1-NEXT: [[TMP1:%.*]] = load [10 x i32]*, [10 x i32]** [[C_ADDR]], align 8 +// CHECK1-NEXT: [[TMP2:%.*]] = load i32*, i32** [[A_ADDR]], align 8 +// CHECK1-NEXT: [[CONV:%.*]] = bitcast i64* [[ARGC_ADDR]] to i32* +// CHECK1-NEXT: [[TMP3:%.*]] = load [10 x i32]*, [10 x i32]** [[D_ADDR]], align 8 +// CHECK1-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK1-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) +// CHECK1-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK1: .execute: +// CHECK1-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3:[0-9]+]]) +// CHECK1-NEXT: [[TMP5:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK1-NEXT: [[CONV1:%.*]] = bitcast i64* [[ARGC_CASTED]] to i32* +// CHECK1-NEXT: store i32 [[TMP5]], i32* [[CONV1]], align 4 +// CHECK1-NEXT: [[TMP6:%.*]] = load i64, i64* [[ARGC_CASTED]], align 8 +// CHECK1-NEXT: store i32 [[TMP4]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK1-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x i32]* [[TMP0]], [10 x i32]* [[TMP1]], i32* [[TMP2]], i64 [[TMP6]], [10 x i32]* [[TMP3]]) #[[ATTR4:[0-9]+]] +// CHECK1-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK1: .omp.deinit: +// CHECK1-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) +// CHECK1-NEXT: br label [[DOTEXIT:%.*]] +// CHECK1: .exit: +// CHECK1-NEXT: ret void +// +// +// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__ +// CHECK1-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]], [10 x i32]* 
nonnull align 4 dereferenceable(40) [[C:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i64 [[ARGC:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[D:%.*]]) #[[ATTR0]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK1-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 8 +// CHECK1-NEXT: [[C_ADDR:%.*]] = alloca [10 x i32]*, align 8 +// CHECK1-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 8 +// CHECK1-NEXT: [[ARGC_ADDR:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[D_ADDR:%.*]] = alloca [10 x i32]*, align 8 +// CHECK1-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[B4:%.*]] = alloca [10 x i32], align 4 +// CHECK1-NEXT: [[I5:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [7 x i8*], align 8 +// CHECK1-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK1-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +// CHECK1-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 8 +// CHECK1-NEXT: store [10 x i32]* [[C]], [10 x i32]** [[C_ADDR]], align 8 +// CHECK1-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 8 +// CHECK1-NEXT: store i64 [[ARGC]], i64* [[ARGC_ADDR]], align 8 +// CHECK1-NEXT: store [10 x i32]* [[D]], [10 x i32]** [[D_ADDR]], align 8 +// CHECK1-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 8 +// CHECK1-NEXT: [[TMP1:%.*]] = load [10 x i32]*, [10 x i32]** [[C_ADDR]], align 8 +// CHECK1-NEXT: [[TMP2:%.*]] = load i32*, i32** [[A_ADDR]], align 8 +// CHECK1-NEXT: [[CONV:%.*]] = bitcast i64* [[ARGC_ADDR]] to i32* +// CHECK1-NEXT: [[TMP3:%.*]] = load [10 x i32]*, [10 x i32]** [[D_ADDR]], align 8 +// CHECK1-NEXT: [[TMP4:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared", align 2 +// CHECK1-NEXT: [[TMP5:%.*]] = load i64, i64* @"_openmp_static_kernel$size", align 8 +// CHECK1-NEXT: call void @__kmpc_get_team_static_memory(i16 1, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds (%"union._shared_openmp_static_memory_type_$_", %"union._shared_openmp_static_memory_type_$_" addrspace(3)* @"_openmp_shared_static_glob_rd_$_", i32 0, i32 0, i32 0) to i8*), i64 [[TMP5]], i16 [[TMP4]], i8** addrspacecast (i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr" to i8**)) +// CHECK1-NEXT: [[TMP6:%.*]] = load i8*, i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr", align 8 +// CHECK1-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, i8* [[TMP6]], i64 0 +// CHECK1-NEXT: [[TMP8:%.*]] = bitcast i8* [[TMP7]] to %struct._globalized_locals_ty* +// CHECK1-NEXT: [[C1:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP8]], i32 0, i32 0 +// CHECK1-NEXT: [[TMP9:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK1-NEXT: store i32 [[TMP9]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK1-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK1-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP10]], 0 +// 
CHECK1-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK1-NEXT: [[SUB3:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK1-NEXT: store i32 [[SUB3]], i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK1-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK1-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK1-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP11]] +// CHECK1-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK1: omp.precond.then: +// CHECK1-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK1-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK1-NEXT: store i32 [[TMP12]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK1-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK1-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK1-NEXT: [[TMP13:%.*]] = bitcast [10 x i32]* [[B4]] to i8* +// CHECK1-NEXT: [[TMP14:%.*]] = bitcast [10 x i32]* [[TMP0]] to i8* +// CHECK1-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[TMP13]], i8* align 4 [[TMP14]], i64 40, i1 false) +// CHECK1-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK1-NEXT: [[TMP15:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK1-NEXT: [[TMP16:%.*]] = load i32, i32* [[TMP15]], align 4 +// CHECK1-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1:[0-9]+]], i32 [[TMP16]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) +// CHECK1-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK1-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK1-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[TMP17]], [[TMP18]] +// CHECK1-NEXT: br i1 [[CMP6]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK1: cond.true: +// CHECK1-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK1-NEXT: br label [[COND_END:%.*]] +// CHECK1: cond.false: +// CHECK1-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK1-NEXT: br label [[COND_END]] +// CHECK1: cond.end: +// CHECK1-NEXT: [[COND:%.*]] = phi i32 [ [[TMP19]], [[COND_TRUE]] ], [ [[TMP20]], [[COND_FALSE]] ] +// CHECK1-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK1-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK1-NEXT: store i32 [[TMP21]], i32* [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK1: omp.inner.for.cond: +// CHECK1-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP23]], 1 +// CHECK1-NEXT: [[CMP7:%.*]] = icmp slt i32 [[TMP22]], [[ADD]] +// CHECK1-NEXT: br i1 [[CMP7]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK1: omp.inner.for.body: +// CHECK1-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK1-NEXT: [[TMP25:%.*]] = zext i32 [[TMP24]] to i64 +// CHECK1-NEXT: [[TMP26:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK1-NEXT: [[TMP27:%.*]] = zext i32 [[TMP26]] to i64 +// CHECK1-NEXT: [[TMP28:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0 +// CHECK1-NEXT: [[TMP29:%.*]] = inttoptr i64 [[TMP25]] to i8* +// CHECK1-NEXT: store i8* [[TMP29]], i8** [[TMP28]], align 8 +// CHECK1-NEXT: [[TMP30:%.*]] = getelementptr inbounds [7 x i8*], [7 x 
i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1 +// CHECK1-NEXT: [[TMP31:%.*]] = inttoptr i64 [[TMP27]] to i8* +// CHECK1-NEXT: store i8* [[TMP31]], i8** [[TMP30]], align 8 +// CHECK1-NEXT: [[TMP32:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 2 +// CHECK1-NEXT: [[TMP33:%.*]] = bitcast i32* [[CONV]] to i8* +// CHECK1-NEXT: store i8* [[TMP33]], i8** [[TMP32]], align 8 +// CHECK1-NEXT: [[TMP34:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 3 +// CHECK1-NEXT: [[TMP35:%.*]] = bitcast i32* [[TMP2]] to i8* +// CHECK1-NEXT: store i8* [[TMP35]], i8** [[TMP34]], align 8 +// CHECK1-NEXT: [[TMP36:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 4 +// CHECK1-NEXT: [[TMP37:%.*]] = bitcast [10 x i32]* [[B4]] to i8* +// CHECK1-NEXT: store i8* [[TMP37]], i8** [[TMP36]], align 8 +// CHECK1-NEXT: [[TMP38:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 5 +// CHECK1-NEXT: [[TMP39:%.*]] = bitcast [10 x i32]* [[C1]] to i8* +// CHECK1-NEXT: store i8* [[TMP39]], i8** [[TMP38]], align 8 +// CHECK1-NEXT: [[TMP40:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 6 +// CHECK1-NEXT: [[TMP41:%.*]] = bitcast [10 x i32]* [[TMP3]] to i8* +// CHECK1-NEXT: store i8* [[TMP41]], i8** [[TMP40]], align 8 +// CHECK1-NEXT: [[TMP42:%.*]] = load i32, i32* [[TMP2]], align 4 +// CHECK1-NEXT: [[TOBOOL:%.*]] = icmp ne i32 [[TMP42]], 0 +// CHECK1-NEXT: [[TMP43:%.*]] = zext i1 [[TOBOOL]] to i32 +// CHECK1-NEXT: [[TMP44:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK1-NEXT: [[TMP45:%.*]] = load i32, i32* [[TMP44]], align 4 +// CHECK1-NEXT: [[TMP46:%.*]] = bitcast [7 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK1-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP45]], i32 [[TMP43]], i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i64, i64, i32*, i32*, [10 x i32]*, [10 x i32]*, [10 x i32]*)* @__omp_outlined__1 to i8*), i8* null, i8** [[TMP46]], i64 7) +// CHECK1-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK1: omp.inner.for.inc: +// CHECK1-NEXT: [[TMP47:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: [[TMP48:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK1-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP47]], [[TMP48]] +// CHECK1-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: [[TMP49:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK1-NEXT: [[TMP50:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK1-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP49]], [[TMP50]] +// CHECK1-NEXT: store i32 [[ADD9]], i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK1-NEXT: [[TMP51:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK1-NEXT: [[TMP52:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK1-NEXT: [[ADD10:%.*]] = add nsw i32 [[TMP51]], [[TMP52]] +// CHECK1-NEXT: store i32 [[ADD10]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK1-NEXT: [[TMP53:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK1-NEXT: [[TMP54:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK1-NEXT: [[CMP11:%.*]] = icmp sgt i32 [[TMP53]], [[TMP54]] +// CHECK1-NEXT: br i1 [[CMP11]], label [[COND_TRUE12:%.*]], label [[COND_FALSE13:%.*]] +// CHECK1: cond.true12: +// CHECK1-NEXT: [[TMP55:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK1-NEXT: br label [[COND_END14:%.*]] +// CHECK1: cond.false13: +// CHECK1-NEXT: [[TMP56:%.*]] = load i32, i32* 
[[DOTOMP_COMB_UB]], align 4 +// CHECK1-NEXT: br label [[COND_END14]] +// CHECK1: cond.end14: +// CHECK1-NEXT: [[COND15:%.*]] = phi i32 [ [[TMP55]], [[COND_TRUE12]] ], [ [[TMP56]], [[COND_FALSE13]] ] +// CHECK1-NEXT: store i32 [[COND15]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK1-NEXT: [[TMP57:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK1-NEXT: store i32 [[TMP57]], i32* [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK1: omp.inner.for.end: +// CHECK1-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK1: omp.loop.exit: +// CHECK1-NEXT: [[TMP58:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK1-NEXT: [[TMP59:%.*]] = load i32, i32* [[TMP58]], align 4 +// CHECK1-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP59]]) +// CHECK1-NEXT: [[TMP60:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK1-NEXT: [[TMP61:%.*]] = icmp ne i32 [[TMP60]], 0 +// CHECK1-NEXT: br i1 [[TMP61]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]] +// CHECK1: .omp.lastprivate.then: +// CHECK1-NEXT: [[TMP62:%.*]] = bitcast [10 x i32]* [[TMP1]] to i8* +// CHECK1-NEXT: [[TMP63:%.*]] = bitcast [10 x i32]* [[C1]] to i8* +// CHECK1-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[TMP62]], i8* align 4 [[TMP63]], i64 40, i1 false) +// CHECK1-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]] +// CHECK1: .omp.lastprivate.done: +// CHECK1-NEXT: br label [[OMP_PRECOND_END]] +// CHECK1: omp.precond.end: +// CHECK1-NEXT: [[TMP64:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared", align 2 +// CHECK1-NEXT: call void @__kmpc_restore_team_static_memory(i16 1, i16 [[TMP64]]) +// CHECK1-NEXT: ret void +// +// +// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__1 +// CHECK1-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], i32* nonnull align 4 dereferenceable(4) [[ARGC:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[C:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[D:%.*]]) #[[ATTR0]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK1-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[ARGC_ADDR:%.*]] = alloca i32*, align 8 +// CHECK1-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 8 +// CHECK1-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 8 +// CHECK1-NEXT: [[C_ADDR:%.*]] = alloca [10 x i32]*, align 8 +// CHECK1-NEXT: [[D_ADDR:%.*]] = alloca [10 x i32]*, align 8 +// CHECK1-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[B4:%.*]] = alloca [10 x i32], align 4 +// CHECK1-NEXT: [[C5:%.*]] = alloca [10 x i32], align 4 +// CHECK1-NEXT: [[I6:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** 
[[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK1-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +// CHECK1-NEXT: store i64 [[DOTPREVIOUS_LB_]], i64* [[DOTPREVIOUS_LB__ADDR]], align 8 +// CHECK1-NEXT: store i64 [[DOTPREVIOUS_UB_]], i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK1-NEXT: store i32* [[ARGC]], i32** [[ARGC_ADDR]], align 8 +// CHECK1-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 8 +// CHECK1-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 8 +// CHECK1-NEXT: store [10 x i32]* [[C]], [10 x i32]** [[C_ADDR]], align 8 +// CHECK1-NEXT: store [10 x i32]* [[D]], [10 x i32]** [[D_ADDR]], align 8 +// CHECK1-NEXT: [[TMP0:%.*]] = load i32*, i32** [[ARGC_ADDR]], align 8 +// CHECK1-NEXT: [[TMP1:%.*]] = load i32*, i32** [[A_ADDR]], align 8 +// CHECK1-NEXT: [[TMP2:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 8 +// CHECK1-NEXT: [[TMP3:%.*]] = load [10 x i32]*, [10 x i32]** [[C_ADDR]], align 8 +// CHECK1-NEXT: [[TMP4:%.*]] = load [10 x i32]*, [10 x i32]** [[D_ADDR]], align 8 +// CHECK1-NEXT: [[TMP5:%.*]] = load i32, i32* [[TMP0]], align 4 +// CHECK1-NEXT: store i32 [[TMP5]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK1-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK1-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP6]], 0 +// CHECK1-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK1-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK1-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK1-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK1-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK1-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP7]] +// CHECK1-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK1: omp.precond.then: +// CHECK1-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 +// CHECK1-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK1-NEXT: store i32 [[TMP8]], i32* [[DOTOMP_UB]], align 4 +// CHECK1-NEXT: [[TMP9:%.*]] = load i64, i64* [[DOTPREVIOUS_LB__ADDR]], align 8 +// CHECK1-NEXT: [[CONV:%.*]] = trunc i64 [[TMP9]] to i32 +// CHECK1-NEXT: [[TMP10:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK1-NEXT: [[CONV3:%.*]] = trunc i64 [[TMP10]] to i32 +// CHECK1-NEXT: store i32 [[CONV]], i32* [[DOTOMP_LB]], align 4 +// CHECK1-NEXT: store i32 [[CONV3]], i32* [[DOTOMP_UB]], align 4 +// CHECK1-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK1-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK1-NEXT: [[TMP11:%.*]] = bitcast [10 x i32]* [[B4]] to i8* +// CHECK1-NEXT: [[TMP12:%.*]] = bitcast [10 x i32]* [[TMP2]] to i8* +// CHECK1-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[TMP11]], i8* align 4 [[TMP12]], i64 40, i1 false) +// CHECK1-NEXT: [[TMP13:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK1-NEXT: [[TMP14:%.*]] = load i32, i32* [[TMP13]], align 4 +// CHECK1-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP14]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) +// CHECK1-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK1-NEXT: store i32 [[TMP15]], i32* [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK1: omp.inner.for.cond: +// CHECK1-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: [[CONV7:%.*]] = sext i32 [[TMP16]] to i64 +// CHECK1-NEXT: [[TMP17:%.*]] = load i64, i64* 
[[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK1-NEXT: [[CMP8:%.*]] = icmp ule i64 [[CONV7]], [[TMP17]] +// CHECK1-NEXT: br i1 [[CMP8]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK1: omp.inner.for.body: +// CHECK1-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP18]], 1 +// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK1-NEXT: store i32 [[ADD]], i32* [[I6]], align 4 +// CHECK1-NEXT: [[CALL:%.*]] = call i32 @_Z3fooPi(i32* [[I6]]) #[[ATTR5:[0-9]+]] +// CHECK1-NEXT: [[CALL9:%.*]] = call i32 @_Z3fooPi(i32* [[TMP1]]) #[[ATTR5]] +// CHECK1-NEXT: [[ADD10:%.*]] = add nsw i32 [[CALL]], [[CALL9]] +// CHECK1-NEXT: [[TMP19:%.*]] = load i32, i32* [[I6]], align 4 +// CHECK1-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP19]] to i64 +// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[B4]], i64 0, i64 [[IDXPROM]] +// CHECK1-NEXT: [[CALL11:%.*]] = call i32 @_Z3fooPi(i32* [[ARRAYIDX]]) #[[ATTR5]] +// CHECK1-NEXT: [[ADD12:%.*]] = add nsw i32 [[ADD10]], [[CALL11]] +// CHECK1-NEXT: [[TMP20:%.*]] = load i32, i32* [[I6]], align 4 +// CHECK1-NEXT: [[IDXPROM13:%.*]] = sext i32 [[TMP20]] to i64 +// CHECK1-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[C5]], i64 0, i64 [[IDXPROM13]] +// CHECK1-NEXT: [[CALL15:%.*]] = call i32 @_Z3fooPi(i32* [[ARRAYIDX14]]) #[[ATTR5]] +// CHECK1-NEXT: [[ADD16:%.*]] = add nsw i32 [[ADD12]], [[CALL15]] +// CHECK1-NEXT: [[TMP21:%.*]] = load i32, i32* [[I6]], align 4 +// CHECK1-NEXT: [[IDXPROM17:%.*]] = sext i32 [[TMP21]] to i64 +// CHECK1-NEXT: [[ARRAYIDX18:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP4]], i64 0, i64 [[IDXPROM17]] +// CHECK1-NEXT: [[CALL19:%.*]] = call i32 @_Z3fooPi(i32* [[ARRAYIDX18]]) #[[ATTR5]] +// CHECK1-NEXT: [[ADD20:%.*]] = add nsw i32 [[ADD16]], [[CALL19]] +// CHECK1-NEXT: store i32 [[ADD20]], i32* [[TMP1]], align 4 +// CHECK1-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK1: omp.body.continue: +// CHECK1-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK1: omp.inner.for.inc: +// CHECK1-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK1-NEXT: [[ADD21:%.*]] = add nsw i32 [[TMP22]], [[TMP23]] +// CHECK1-NEXT: store i32 [[ADD21]], i32* [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK1: omp.inner.for.end: +// CHECK1-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK1: omp.loop.exit: +// CHECK1-NEXT: [[TMP24:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK1-NEXT: [[TMP25:%.*]] = load i32, i32* [[TMP24]], align 4 +// CHECK1-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP25]]) +// CHECK1-NEXT: [[TMP26:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK1-NEXT: [[TMP27:%.*]] = icmp ne i32 [[TMP26]], 0 +// CHECK1-NEXT: br i1 [[TMP27]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]] +// CHECK1: .omp.lastprivate.then: +// CHECK1-NEXT: [[TMP28:%.*]] = bitcast [10 x i32]* [[TMP3]] to i8* +// CHECK1-NEXT: [[TMP29:%.*]] = bitcast [10 x i32]* [[C5]] to i8* +// CHECK1-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[TMP28]], i8* align 4 [[TMP29]], i64 40, i1 false) +// CHECK1-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]] +// CHECK1: .omp.lastprivate.done: +// CHECK1-NEXT: br label [[OMP_PRECOND_END]] +// CHECK1: omp.precond.end: +// CHECK1-NEXT: ret void +// +// +// CHECK2-LABEL: define 
{{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l31 +// CHECK2-SAME: ([10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[C:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i64 [[ARGC:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[D:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 8 +// CHECK2-NEXT: [[C_ADDR:%.*]] = alloca [10 x i32]*, align 8 +// CHECK2-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 8 +// CHECK2-NEXT: [[ARGC_ADDR:%.*]] = alloca i64, align 8 +// CHECK2-NEXT: [[D_ADDR:%.*]] = alloca [10 x i32]*, align 8 +// CHECK2-NEXT: [[ARGC_CASTED:%.*]] = alloca i64, align 8 +// CHECK2-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK2-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 8 +// CHECK2-NEXT: store [10 x i32]* [[C]], [10 x i32]** [[C_ADDR]], align 8 +// CHECK2-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 8 +// CHECK2-NEXT: store i64 [[ARGC]], i64* [[ARGC_ADDR]], align 8 +// CHECK2-NEXT: store [10 x i32]* [[D]], [10 x i32]** [[D_ADDR]], align 8 +// CHECK2-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 8 +// CHECK2-NEXT: [[TMP1:%.*]] = load [10 x i32]*, [10 x i32]** [[C_ADDR]], align 8 +// CHECK2-NEXT: [[TMP2:%.*]] = load i32*, i32** [[A_ADDR]], align 8 +// CHECK2-NEXT: [[CONV:%.*]] = bitcast i64* [[ARGC_ADDR]] to i32* +// CHECK2-NEXT: [[TMP3:%.*]] = load [10 x i32]*, [10 x i32]** [[D_ADDR]], align 8 +// CHECK2-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK2-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) +// CHECK2-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK2: .execute: +// CHECK2-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3:[0-9]+]]) +// CHECK2-NEXT: [[TMP5:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK2-NEXT: [[CONV1:%.*]] = bitcast i64* [[ARGC_CASTED]] to i32* +// CHECK2-NEXT: store i32 [[TMP5]], i32* [[CONV1]], align 4 +// CHECK2-NEXT: [[TMP6:%.*]] = load i64, i64* [[ARGC_CASTED]], align 8 +// CHECK2-NEXT: store i32 [[TMP4]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK2-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x i32]* [[TMP0]], [10 x i32]* [[TMP1]], i32* [[TMP2]], i64 [[TMP6]], [10 x i32]* [[TMP3]]) #[[ATTR4:[0-9]+]] +// CHECK2-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK2: .omp.deinit: +// CHECK2-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) +// CHECK2-NEXT: br label [[DOTEXIT:%.*]] +// CHECK2: .exit: +// CHECK2-NEXT: ret void +// +// +// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__ +// CHECK2-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[C:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i64 [[ARGC:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[D:%.*]]) #[[ATTR0]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK2-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 8 +// CHECK2-NEXT: [[C_ADDR:%.*]] = alloca [10 x i32]*, align 8 +// CHECK2-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 8 +// CHECK2-NEXT: [[ARGC_ADDR:%.*]] = 
alloca i64, align 8 +// CHECK2-NEXT: [[D_ADDR:%.*]] = alloca [10 x i32]*, align 8 +// CHECK2-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[B4:%.*]] = alloca [10 x i32], align 4 +// CHECK2-NEXT: [[I5:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [7 x i8*], align 8 +// CHECK2-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK2-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +// CHECK2-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 8 +// CHECK2-NEXT: store [10 x i32]* [[C]], [10 x i32]** [[C_ADDR]], align 8 +// CHECK2-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 8 +// CHECK2-NEXT: store i64 [[ARGC]], i64* [[ARGC_ADDR]], align 8 +// CHECK2-NEXT: store [10 x i32]* [[D]], [10 x i32]** [[D_ADDR]], align 8 +// CHECK2-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 8 +// CHECK2-NEXT: [[TMP1:%.*]] = load [10 x i32]*, [10 x i32]** [[C_ADDR]], align 8 +// CHECK2-NEXT: [[TMP2:%.*]] = load i32*, i32** [[A_ADDR]], align 8 +// CHECK2-NEXT: [[CONV:%.*]] = bitcast i64* [[ARGC_ADDR]] to i32* +// CHECK2-NEXT: [[TMP3:%.*]] = load [10 x i32]*, [10 x i32]** [[D_ADDR]], align 8 +// CHECK2-NEXT: [[TMP4:%.*]] = call i8* @__kmpc_data_sharing_push_stack(i64 40, i16 1) +// CHECK2-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to %struct._globalized_locals_ty* +// CHECK2-NEXT: [[C1:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP5]], i32 0, i32 0 +// CHECK2-NEXT: [[TMP6:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK2-NEXT: store i32 [[TMP6]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK2-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK2-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP7]], 0 +// CHECK2-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK2-NEXT: [[SUB3:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK2-NEXT: store i32 [[SUB3]], i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK2-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK2-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK2-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP8]] +// CHECK2-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK2: omp.precond.then: +// CHECK2-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK2-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK2-NEXT: store i32 [[TMP9]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK2-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK2-NEXT: [[TMP10:%.*]] = bitcast [10 x i32]* [[B4]] to i8* +// CHECK2-NEXT: [[TMP11:%.*]] = bitcast [10 x i32]* [[TMP0]] to i8* +// CHECK2-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[TMP10]], i8* align 4 [[TMP11]], i64 40, i1 false) +// CHECK2-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK2-NEXT: [[TMP12:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// 
CHECK2-NEXT: [[TMP13:%.*]] = load i32, i32* [[TMP12]], align 4 +// CHECK2-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1:[0-9]+]], i32 [[TMP13]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) +// CHECK2-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK2-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[TMP14]], [[TMP15]] +// CHECK2-NEXT: br i1 [[CMP6]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK2: cond.true: +// CHECK2-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK2-NEXT: br label [[COND_END:%.*]] +// CHECK2: cond.false: +// CHECK2-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: br label [[COND_END]] +// CHECK2: cond.end: +// CHECK2-NEXT: [[COND:%.*]] = phi i32 [ [[TMP16]], [[COND_TRUE]] ], [ [[TMP17]], [[COND_FALSE]] ] +// CHECK2-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK2-NEXT: store i32 [[TMP18]], i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK2: omp.inner.for.cond: +// CHECK2-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK2-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP20]], 1 +// CHECK2-NEXT: [[CMP7:%.*]] = icmp slt i32 [[TMP19]], [[ADD]] +// CHECK2-NEXT: br i1 [[CMP7]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK2: omp.inner.for.body: +// CHECK2-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK2-NEXT: [[TMP22:%.*]] = zext i32 [[TMP21]] to i64 +// CHECK2-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: [[TMP24:%.*]] = zext i32 [[TMP23]] to i64 +// CHECK2-NEXT: [[TMP25:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0 +// CHECK2-NEXT: [[TMP26:%.*]] = inttoptr i64 [[TMP22]] to i8* +// CHECK2-NEXT: store i8* [[TMP26]], i8** [[TMP25]], align 8 +// CHECK2-NEXT: [[TMP27:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1 +// CHECK2-NEXT: [[TMP28:%.*]] = inttoptr i64 [[TMP24]] to i8* +// CHECK2-NEXT: store i8* [[TMP28]], i8** [[TMP27]], align 8 +// CHECK2-NEXT: [[TMP29:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 2 +// CHECK2-NEXT: [[TMP30:%.*]] = bitcast i32* [[CONV]] to i8* +// CHECK2-NEXT: store i8* [[TMP30]], i8** [[TMP29]], align 8 +// CHECK2-NEXT: [[TMP31:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 3 +// CHECK2-NEXT: [[TMP32:%.*]] = bitcast i32* [[TMP2]] to i8* +// CHECK2-NEXT: store i8* [[TMP32]], i8** [[TMP31]], align 8 +// CHECK2-NEXT: [[TMP33:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 4 +// CHECK2-NEXT: [[TMP34:%.*]] = bitcast [10 x i32]* [[B4]] to i8* +// CHECK2-NEXT: store i8* [[TMP34]], i8** [[TMP33]], align 8 +// CHECK2-NEXT: [[TMP35:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 5 +// CHECK2-NEXT: [[TMP36:%.*]] = bitcast [10 x i32]* [[C1]] to i8* +// CHECK2-NEXT: store i8* [[TMP36]], i8** [[TMP35]], align 8 +// CHECK2-NEXT: [[TMP37:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 6 
+// CHECK2-NEXT: [[TMP38:%.*]] = bitcast [10 x i32]* [[TMP3]] to i8* +// CHECK2-NEXT: store i8* [[TMP38]], i8** [[TMP37]], align 8 +// CHECK2-NEXT: [[TMP39:%.*]] = load i32, i32* [[TMP2]], align 4 +// CHECK2-NEXT: [[TOBOOL:%.*]] = icmp ne i32 [[TMP39]], 0 +// CHECK2-NEXT: [[TMP40:%.*]] = zext i1 [[TOBOOL]] to i32 +// CHECK2-NEXT: [[TMP41:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK2-NEXT: [[TMP42:%.*]] = load i32, i32* [[TMP41]], align 4 +// CHECK2-NEXT: [[TMP43:%.*]] = bitcast [7 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK2-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP42]], i32 [[TMP40]], i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i64, i64, i32*, i32*, [10 x i32]*, [10 x i32]*, [10 x i32]*)* @__omp_outlined__1 to i8*), i8* null, i8** [[TMP43]], i64 7) +// CHECK2-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK2: omp.inner.for.inc: +// CHECK2-NEXT: [[TMP44:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: [[TMP45:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK2-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP44]], [[TMP45]] +// CHECK2-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: [[TMP46:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK2-NEXT: [[TMP47:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK2-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP46]], [[TMP47]] +// CHECK2-NEXT: store i32 [[ADD9]], i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK2-NEXT: [[TMP48:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: [[TMP49:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK2-NEXT: [[ADD10:%.*]] = add nsw i32 [[TMP48]], [[TMP49]] +// CHECK2-NEXT: store i32 [[ADD10]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: [[TMP50:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: [[TMP51:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK2-NEXT: [[CMP11:%.*]] = icmp sgt i32 [[TMP50]], [[TMP51]] +// CHECK2-NEXT: br i1 [[CMP11]], label [[COND_TRUE12:%.*]], label [[COND_FALSE13:%.*]] +// CHECK2: cond.true12: +// CHECK2-NEXT: [[TMP52:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK2-NEXT: br label [[COND_END14:%.*]] +// CHECK2: cond.false13: +// CHECK2-NEXT: [[TMP53:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: br label [[COND_END14]] +// CHECK2: cond.end14: +// CHECK2-NEXT: [[COND15:%.*]] = phi i32 [ [[TMP52]], [[COND_TRUE12]] ], [ [[TMP53]], [[COND_FALSE13]] ] +// CHECK2-NEXT: store i32 [[COND15]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: [[TMP54:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK2-NEXT: store i32 [[TMP54]], i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK2: omp.inner.for.end: +// CHECK2-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK2: omp.loop.exit: +// CHECK2-NEXT: [[TMP55:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK2-NEXT: [[TMP56:%.*]] = load i32, i32* [[TMP55]], align 4 +// CHECK2-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP56]]) +// CHECK2-NEXT: [[TMP57:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK2-NEXT: [[TMP58:%.*]] = icmp ne i32 [[TMP57]], 0 +// CHECK2-NEXT: br i1 [[TMP58]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]] +// CHECK2: .omp.lastprivate.then: +// CHECK2-NEXT: [[TMP59:%.*]] = bitcast [10 x i32]* [[TMP1]] to i8* +// CHECK2-NEXT: [[TMP60:%.*]] = bitcast [10 x i32]* [[C1]] to i8* +// CHECK2-NEXT: call void 
@llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[TMP59]], i8* align 4 [[TMP60]], i64 40, i1 false) +// CHECK2-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]] +// CHECK2: .omp.lastprivate.done: +// CHECK2-NEXT: br label [[OMP_PRECOND_END]] +// CHECK2: omp.precond.end: +// CHECK2-NEXT: call void @__kmpc_data_sharing_pop_stack(i8* [[TMP4]]) +// CHECK2-NEXT: ret void +// +// +// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__1 +// CHECK2-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], i32* nonnull align 4 dereferenceable(4) [[ARGC:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[C:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[D:%.*]]) #[[ATTR0]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK2-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8 +// CHECK2-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8 +// CHECK2-NEXT: [[ARGC_ADDR:%.*]] = alloca i32*, align 8 +// CHECK2-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 8 +// CHECK2-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 8 +// CHECK2-NEXT: [[C_ADDR:%.*]] = alloca [10 x i32]*, align 8 +// CHECK2-NEXT: [[D_ADDR:%.*]] = alloca [10 x i32]*, align 8 +// CHECK2-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[B4:%.*]] = alloca [10 x i32], align 4 +// CHECK2-NEXT: [[C5:%.*]] = alloca [10 x i32], align 4 +// CHECK2-NEXT: [[I6:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK2-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +// CHECK2-NEXT: store i64 [[DOTPREVIOUS_LB_]], i64* [[DOTPREVIOUS_LB__ADDR]], align 8 +// CHECK2-NEXT: store i64 [[DOTPREVIOUS_UB_]], i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK2-NEXT: store i32* [[ARGC]], i32** [[ARGC_ADDR]], align 8 +// CHECK2-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 8 +// CHECK2-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 8 +// CHECK2-NEXT: store [10 x i32]* [[C]], [10 x i32]** [[C_ADDR]], align 8 +// CHECK2-NEXT: store [10 x i32]* [[D]], [10 x i32]** [[D_ADDR]], align 8 +// CHECK2-NEXT: [[TMP0:%.*]] = load i32*, i32** [[ARGC_ADDR]], align 8 +// CHECK2-NEXT: [[TMP1:%.*]] = load i32*, i32** [[A_ADDR]], align 8 +// CHECK2-NEXT: [[TMP2:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 8 +// CHECK2-NEXT: [[TMP3:%.*]] = load [10 x i32]*, [10 x i32]** [[C_ADDR]], align 8 +// CHECK2-NEXT: [[TMP4:%.*]] = load [10 x i32]*, [10 x i32]** [[D_ADDR]], align 8 +// CHECK2-NEXT: [[TMP5:%.*]] = load i32, i32* [[TMP0]], align 4 +// CHECK2-NEXT: store i32 [[TMP5]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK2-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK2-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP6]], 0 +// CHECK2-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// 
CHECK2-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK2-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK2-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK2-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK2-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP7]] +// CHECK2-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK2: omp.precond.then: +// CHECK2-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 +// CHECK2-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK2-NEXT: store i32 [[TMP8]], i32* [[DOTOMP_UB]], align 4 +// CHECK2-NEXT: [[TMP9:%.*]] = load i64, i64* [[DOTPREVIOUS_LB__ADDR]], align 8 +// CHECK2-NEXT: [[CONV:%.*]] = trunc i64 [[TMP9]] to i32 +// CHECK2-NEXT: [[TMP10:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK2-NEXT: [[CONV3:%.*]] = trunc i64 [[TMP10]] to i32 +// CHECK2-NEXT: store i32 [[CONV]], i32* [[DOTOMP_LB]], align 4 +// CHECK2-NEXT: store i32 [[CONV3]], i32* [[DOTOMP_UB]], align 4 +// CHECK2-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK2-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK2-NEXT: [[TMP11:%.*]] = bitcast [10 x i32]* [[B4]] to i8* +// CHECK2-NEXT: [[TMP12:%.*]] = bitcast [10 x i32]* [[TMP2]] to i8* +// CHECK2-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[TMP11]], i8* align 4 [[TMP12]], i64 40, i1 false) +// CHECK2-NEXT: [[TMP13:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK2-NEXT: [[TMP14:%.*]] = load i32, i32* [[TMP13]], align 4 +// CHECK2-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP14]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) +// CHECK2-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK2-NEXT: store i32 [[TMP15]], i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK2: omp.inner.for.cond: +// CHECK2-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: [[CONV7:%.*]] = sext i32 [[TMP16]] to i64 +// CHECK2-NEXT: [[TMP17:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK2-NEXT: [[CMP8:%.*]] = icmp ule i64 [[CONV7]], [[TMP17]] +// CHECK2-NEXT: br i1 [[CMP8]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK2: omp.inner.for.body: +// CHECK2-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP18]], 1 +// CHECK2-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK2-NEXT: store i32 [[ADD]], i32* [[I6]], align 4 +// CHECK2-NEXT: [[CALL:%.*]] = call i32 @_Z3fooPi(i32* [[I6]]) #[[ATTR5:[0-9]+]] +// CHECK2-NEXT: [[CALL9:%.*]] = call i32 @_Z3fooPi(i32* [[TMP1]]) #[[ATTR5]] +// CHECK2-NEXT: [[ADD10:%.*]] = add nsw i32 [[CALL]], [[CALL9]] +// CHECK2-NEXT: [[TMP19:%.*]] = load i32, i32* [[I6]], align 4 +// CHECK2-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP19]] to i64 +// CHECK2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[B4]], i64 0, i64 [[IDXPROM]] +// CHECK2-NEXT: [[CALL11:%.*]] = call i32 @_Z3fooPi(i32* [[ARRAYIDX]]) #[[ATTR5]] +// CHECK2-NEXT: [[ADD12:%.*]] = add nsw i32 [[ADD10]], [[CALL11]] +// CHECK2-NEXT: [[TMP20:%.*]] = load i32, i32* [[I6]], align 4 +// CHECK2-NEXT: [[IDXPROM13:%.*]] = sext i32 [[TMP20]] to i64 +// CHECK2-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[C5]], i64 0, i64 [[IDXPROM13]] +// CHECK2-NEXT: [[CALL15:%.*]] = call 
i32 @_Z3fooPi(i32* [[ARRAYIDX14]]) #[[ATTR5]] +// CHECK2-NEXT: [[ADD16:%.*]] = add nsw i32 [[ADD12]], [[CALL15]] +// CHECK2-NEXT: [[TMP21:%.*]] = load i32, i32* [[I6]], align 4 +// CHECK2-NEXT: [[IDXPROM17:%.*]] = sext i32 [[TMP21]] to i64 +// CHECK2-NEXT: [[ARRAYIDX18:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP4]], i64 0, i64 [[IDXPROM17]] +// CHECK2-NEXT: [[CALL19:%.*]] = call i32 @_Z3fooPi(i32* [[ARRAYIDX18]]) #[[ATTR5]] +// CHECK2-NEXT: [[ADD20:%.*]] = add nsw i32 [[ADD16]], [[CALL19]] +// CHECK2-NEXT: store i32 [[ADD20]], i32* [[TMP1]], align 4 +// CHECK2-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK2: omp.body.continue: +// CHECK2-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK2: omp.inner.for.inc: +// CHECK2-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK2-NEXT: [[ADD21:%.*]] = add nsw i32 [[TMP22]], [[TMP23]] +// CHECK2-NEXT: store i32 [[ADD21]], i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK2: omp.inner.for.end: +// CHECK2-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK2: omp.loop.exit: +// CHECK2-NEXT: [[TMP24:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK2-NEXT: [[TMP25:%.*]] = load i32, i32* [[TMP24]], align 4 +// CHECK2-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP25]]) +// CHECK2-NEXT: [[TMP26:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK2-NEXT: [[TMP27:%.*]] = icmp ne i32 [[TMP26]], 0 +// CHECK2-NEXT: br i1 [[TMP27]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]] +// CHECK2: .omp.lastprivate.then: +// CHECK2-NEXT: [[TMP28:%.*]] = bitcast [10 x i32]* [[TMP3]] to i8* +// CHECK2-NEXT: [[TMP29:%.*]] = bitcast [10 x i32]* [[C5]] to i8* +// CHECK2-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[TMP28]], i8* align 4 [[TMP29]], i64 40, i1 false) +// CHECK2-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]] +// CHECK2: .omp.lastprivate.done: +// CHECK2-NEXT: br label [[OMP_PRECOND_END]] +// CHECK2: omp.precond.end: +// CHECK2-NEXT: ret void +// +// +// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l31 +// CHECK3-SAME: ([10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[C:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i32 [[ARGC:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[D:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK3-NEXT: entry: +// CHECK3-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 +// CHECK3-NEXT: [[C_ADDR:%.*]] = alloca [10 x i32]*, align 4 +// CHECK3-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 4 +// CHECK3-NEXT: [[ARGC_ADDR:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[D_ADDR:%.*]] = alloca [10 x i32]*, align 4 +// CHECK3-NEXT: [[ARGC_CASTED:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK3-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 +// CHECK3-NEXT: store [10 x i32]* [[C]], [10 x i32]** [[C_ADDR]], align 4 +// CHECK3-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 4 +// CHECK3-NEXT: store i32 [[ARGC]], i32* [[ARGC_ADDR]], align 4 +// CHECK3-NEXT: store [10 x i32]* [[D]], [10 x i32]** [[D_ADDR]], align 4 +// CHECK3-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 +// CHECK3-NEXT: [[TMP1:%.*]] = 
load [10 x i32]*, [10 x i32]** [[C_ADDR]], align 4 +// CHECK3-NEXT: [[TMP2:%.*]] = load i32*, i32** [[A_ADDR]], align 4 +// CHECK3-NEXT: [[TMP3:%.*]] = load [10 x i32]*, [10 x i32]** [[D_ADDR]], align 4 +// CHECK3-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK3-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) +// CHECK3-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK3: .execute: +// CHECK3-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3:[0-9]+]]) +// CHECK3-NEXT: [[TMP5:%.*]] = load i32, i32* [[ARGC_ADDR]], align 4 +// CHECK3-NEXT: store i32 [[TMP5]], i32* [[ARGC_CASTED]], align 4 +// CHECK3-NEXT: [[TMP6:%.*]] = load i32, i32* [[ARGC_CASTED]], align 4 +// CHECK3-NEXT: store i32 [[TMP4]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK3-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x i32]* [[TMP0]], [10 x i32]* [[TMP1]], i32* [[TMP2]], i32 [[TMP6]], [10 x i32]* [[TMP3]]) #[[ATTR4:[0-9]+]] +// CHECK3-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK3: .omp.deinit: +// CHECK3-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) +// CHECK3-NEXT: br label [[DOTEXIT:%.*]] +// CHECK3: .exit: +// CHECK3-NEXT: ret void +// +// +// CHECK3-LABEL: define {{[^@]+}}@__omp_outlined__ +// CHECK3-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[C:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i32 [[ARGC:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[D:%.*]]) #[[ATTR0]] { +// CHECK3-NEXT: entry: +// CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK3-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 +// CHECK3-NEXT: [[C_ADDR:%.*]] = alloca [10 x i32]*, align 4 +// CHECK3-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 4 +// CHECK3-NEXT: [[ARGC_ADDR:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[D_ADDR:%.*]] = alloca [10 x i32]*, align 4 +// CHECK3-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[B4:%.*]] = alloca [10 x i32], align 4 +// CHECK3-NEXT: [[I5:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [7 x i8*], align 4 +// CHECK3-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK3-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK3-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 +// CHECK3-NEXT: store [10 x i32]* [[C]], [10 x i32]** [[C_ADDR]], align 4 +// CHECK3-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 4 +// CHECK3-NEXT: store i32 [[ARGC]], i32* [[ARGC_ADDR]], align 4 +// CHECK3-NEXT: store [10 x i32]* [[D]], [10 x i32]** [[D_ADDR]], align 4 +// CHECK3-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 +// CHECK3-NEXT: [[TMP1:%.*]] = load [10 x i32]*, [10 x i32]** [[C_ADDR]], align 4 +// CHECK3-NEXT: 
[[TMP2:%.*]] = load i32*, i32** [[A_ADDR]], align 4
+// CHECK3-NEXT: [[TMP3:%.*]] = load [10 x i32]*, [10 x i32]** [[D_ADDR]], align 4
+// CHECK3-NEXT: [[TMP4:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared", align 2
+// CHECK3-NEXT: [[TMP5:%.*]] = load i32, i32* @"_openmp_static_kernel$size", align 4
+// CHECK3-NEXT: call void @__kmpc_get_team_static_memory(i16 1, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds (%"union._shared_openmp_static_memory_type_$_", %"union._shared_openmp_static_memory_type_$_" addrspace(3)* @"_openmp_shared_static_glob_rd_$_", i32 0, i32 0, i32 0) to i8*), i32 [[TMP5]], i16 [[TMP4]], i8** addrspacecast (i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr" to i8**))
+// CHECK3-NEXT: [[TMP6:%.*]] = load i8*, i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr", align 4
+// CHECK3-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, i8* [[TMP6]], i32 0
+// CHECK3-NEXT: [[TMP8:%.*]] = bitcast i8* [[TMP7]] to %struct._globalized_locals_ty*
+// CHECK3-NEXT: [[C1:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP8]], i32 0, i32 0
+// CHECK3-NEXT: [[TMP9:%.*]] = load i32, i32* [[ARGC_ADDR]], align 4
+// CHECK3-NEXT: store i32 [[TMP9]], i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK3-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK3-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP10]], 0
+// CHECK3-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK3-NEXT: [[SUB3:%.*]] = sub nsw i32 [[DIV]], 1
+// CHECK3-NEXT: store i32 [[SUB3]], i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK3-NEXT: store i32 0, i32* [[I]], align 4
+// CHECK3-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK3-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP11]]
+// CHECK3-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]]
+// CHECK3: omp.precond.then:
+// CHECK3-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK3-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK3-NEXT: store i32 [[TMP12]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK3-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK3-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK3-NEXT: [[TMP13:%.*]] = bitcast [10 x i32]* [[B4]] to i8*
+// CHECK3-NEXT: [[TMP14:%.*]] = bitcast [10 x i32]* [[TMP0]] to i8*
+// CHECK3-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 [[TMP13]], i8* align 4 [[TMP14]], i32 40, i1 false)
+// CHECK3-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK3-NEXT: [[TMP15:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK3-NEXT: [[TMP16:%.*]] = load i32, i32* [[TMP15]], align 4
+// CHECK3-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1:[0-9]+]], i32 [[TMP16]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]])
+// CHECK3-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK3-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK3-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[TMP17]], [[TMP18]]
+// CHECK3-NEXT: br i1 [[CMP6]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK3: cond.true:
+// CHECK3-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK3-NEXT: br label [[COND_END:%.*]]
+// CHECK3: cond.false:
+// CHECK3-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK3-NEXT: br label [[COND_END]]
+// CHECK3: cond.end:
+// CHECK3-NEXT: [[COND:%.*]] = phi i32 [ [[TMP19]], [[COND_TRUE]] ], [ [[TMP20]], [[COND_FALSE]] ]
+// CHECK3-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK3-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK3-NEXT: store i32 [[TMP21]], i32* [[DOTOMP_IV]], align 4
+// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK3: omp.inner.for.cond:
+// CHECK3-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK3-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK3-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP23]], 1
+// CHECK3-NEXT: [[CMP7:%.*]] = icmp slt i32 [[TMP22]], [[ADD]]
+// CHECK3-NEXT: br i1 [[CMP7]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK3: omp.inner.for.body:
+// CHECK3-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK3-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK3-NEXT: [[TMP26:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0
+// CHECK3-NEXT: [[TMP27:%.*]] = inttoptr i32 [[TMP24]] to i8*
+// CHECK3-NEXT: store i8* [[TMP27]], i8** [[TMP26]], align 4
+// CHECK3-NEXT: [[TMP28:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1
+// CHECK3-NEXT: [[TMP29:%.*]] = inttoptr i32 [[TMP25]] to i8*
+// CHECK3-NEXT: store i8* [[TMP29]], i8** [[TMP28]], align 4
+// CHECK3-NEXT: [[TMP30:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2
+// CHECK3-NEXT: [[TMP31:%.*]] = bitcast i32* [[ARGC_ADDR]] to i8*
+// CHECK3-NEXT: store i8* [[TMP31]], i8** [[TMP30]], align 4
+// CHECK3-NEXT: [[TMP32:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3
+// CHECK3-NEXT: [[TMP33:%.*]] = bitcast i32* [[TMP2]] to i8*
+// CHECK3-NEXT: store i8* [[TMP33]], i8** [[TMP32]], align 4
+// CHECK3-NEXT: [[TMP34:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 4
+// CHECK3-NEXT: [[TMP35:%.*]] = bitcast [10 x i32]* [[B4]] to i8*
+// CHECK3-NEXT: store i8* [[TMP35]], i8** [[TMP34]], align 4
+// CHECK3-NEXT: [[TMP36:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 5
+// CHECK3-NEXT: [[TMP37:%.*]] = bitcast [10 x i32]* [[C1]] to i8*
+// CHECK3-NEXT: store i8* [[TMP37]], i8** [[TMP36]], align 4
+// CHECK3-NEXT: [[TMP38:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 6
+// CHECK3-NEXT: [[TMP39:%.*]] = bitcast [10 x i32]* [[TMP3]] to i8*
+// CHECK3-NEXT: store i8* [[TMP39]], i8** [[TMP38]], align 4
+// CHECK3-NEXT: [[TMP40:%.*]] = load i32, i32* [[TMP2]], align 4
+// CHECK3-NEXT: [[TOBOOL:%.*]] = icmp ne i32 [[TMP40]], 0
+// CHECK3-NEXT: [[TMP41:%.*]] = zext i1 [[TOBOOL]] to i32
+// CHECK3-NEXT: [[TMP42:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK3-NEXT: [[TMP43:%.*]] = load i32, i32* [[TMP42]], align 4
+// CHECK3-NEXT: [[TMP44:%.*]] = bitcast [7 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK3-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP43]], i32 [[TMP41]], i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, i32*, i32*, [10 x i32]*, [10 x i32]*, [10 x i32]*)* @__omp_outlined__1 to i8*), i8* null, i8** [[TMP44]], i32 7)
+// CHECK3-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK3: omp.inner.for.inc:
+// CHECK3-NEXT: [[TMP45:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK3-NEXT: [[TMP46:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK3-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP45]], [[TMP46]]
+// CHECK3-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_IV]], align 4
+// CHECK3-NEXT: [[TMP47:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK3-NEXT: [[TMP48:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK3-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP47]], [[TMP48]]
+// CHECK3-NEXT: store i32 [[ADD9]], i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK3-NEXT: [[TMP49:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK3-NEXT: [[TMP50:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK3-NEXT: [[ADD10:%.*]] = add nsw i32 [[TMP49]], [[TMP50]]
+// CHECK3-NEXT: store i32 [[ADD10]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK3-NEXT: [[TMP51:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK3-NEXT: [[TMP52:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK3-NEXT: [[CMP11:%.*]] = icmp sgt i32 [[TMP51]], [[TMP52]]
+// CHECK3-NEXT: br i1 [[CMP11]], label [[COND_TRUE12:%.*]], label [[COND_FALSE13:%.*]]
+// CHECK3: cond.true12:
+// CHECK3-NEXT: [[TMP53:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK3-NEXT: br label [[COND_END14:%.*]]
+// CHECK3: cond.false13:
+// CHECK3-NEXT: [[TMP54:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK3-NEXT: br label [[COND_END14]]
+// CHECK3: cond.end14:
+// CHECK3-NEXT: [[COND15:%.*]] = phi i32 [ [[TMP53]], [[COND_TRUE12]] ], [ [[TMP54]], [[COND_FALSE13]] ]
+// CHECK3-NEXT: store i32 [[COND15]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK3-NEXT: [[TMP55:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK3-NEXT: store i32 [[TMP55]], i32* [[DOTOMP_IV]], align 4
+// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK3: omp.inner.for.end:
+// CHECK3-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK3: omp.loop.exit:
+// CHECK3-NEXT: [[TMP56:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK3-NEXT: [[TMP57:%.*]] = load i32, i32* [[TMP56]], align 4
+// CHECK3-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP57]])
+// CHECK3-NEXT: [[TMP58:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK3-NEXT: [[TMP59:%.*]] = icmp ne i32 [[TMP58]], 0
+// CHECK3-NEXT: br i1 [[TMP59]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]]
+// CHECK3: .omp.lastprivate.then:
+// CHECK3-NEXT: [[TMP60:%.*]] = bitcast [10 x i32]* [[TMP1]] to i8*
+// CHECK3-NEXT: [[TMP61:%.*]] = bitcast [10 x i32]* [[C1]] to i8*
+// CHECK3-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 [[TMP60]], i8* align 4 [[TMP61]], i32 40, i1 false)
+// CHECK3-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]]
+// CHECK3: .omp.lastprivate.done:
+// CHECK3-NEXT: br label [[OMP_PRECOND_END]]
+// CHECK3: omp.precond.end:
+// CHECK3-NEXT: [[TMP62:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared", align 2
+// CHECK3-NEXT: call void @__kmpc_restore_team_static_memory(i16 1, i16 [[TMP62]])
+// CHECK3-NEXT: ret void
+//
+//
+// CHECK3-LABEL: define {{[^@]+}}@__omp_outlined__1
+// CHECK3-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32* nonnull align 4 dereferenceable(4) [[ARGC:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[C:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[D:%.*]]) #[[ATTR0]] {
+// CHECK3-NEXT: entry:
+// CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK3-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[ARGC_ADDR:%.*]] = alloca i32*, align 4
+// CHECK3-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 4
+// CHECK3-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4
+// CHECK3-NEXT: [[C_ADDR:%.*]] = alloca [10 x i32]*, align 4
+// CHECK3-NEXT: [[D_ADDR:%.*]] = alloca [10 x i32]*, align 4
+// CHECK3-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[B3:%.*]] = alloca [10 x i32], align 4
+// CHECK3-NEXT: [[C4:%.*]] = alloca [10 x i32], align 4
+// CHECK3-NEXT: [[I5:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK3-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK3-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK3-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK3-NEXT: store i32* [[ARGC]], i32** [[ARGC_ADDR]], align 4
+// CHECK3-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 4
+// CHECK3-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4
+// CHECK3-NEXT: store [10 x i32]* [[C]], [10 x i32]** [[C_ADDR]], align 4
+// CHECK3-NEXT: store [10 x i32]* [[D]], [10 x i32]** [[D_ADDR]], align 4
+// CHECK3-NEXT: [[TMP0:%.*]] = load i32*, i32** [[ARGC_ADDR]], align 4
+// CHECK3-NEXT: [[TMP1:%.*]] = load i32*, i32** [[A_ADDR]], align 4
+// CHECK3-NEXT: [[TMP2:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4
+// CHECK3-NEXT: [[TMP3:%.*]] = load [10 x i32]*, [10 x i32]** [[C_ADDR]], align 4
+// CHECK3-NEXT: [[TMP4:%.*]] = load [10 x i32]*, [10 x i32]** [[D_ADDR]], align 4
+// CHECK3-NEXT: [[TMP5:%.*]] = load i32, i32* [[TMP0]], align 4
+// CHECK3-NEXT: store i32 [[TMP5]], i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK3-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK3-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP6]], 0
+// CHECK3-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK3-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1
+// CHECK3-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK3-NEXT: store i32 0, i32* [[I]], align 4
+// CHECK3-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK3-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP7]]
+// CHECK3-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]]
+// CHECK3: omp.precond.then:
+// CHECK3-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK3-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK3-NEXT: store i32 [[TMP8]], i32* [[DOTOMP_UB]], align 4
+// CHECK3-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK3-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK3-NEXT: store i32 [[TMP9]], i32* [[DOTOMP_LB]], align 4
+// CHECK3-NEXT: store i32 [[TMP10]], i32* [[DOTOMP_UB]], align 4
+// CHECK3-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK3-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK3-NEXT: [[TMP11:%.*]] = bitcast [10 x i32]* [[B3]] to i8*
+// CHECK3-NEXT: [[TMP12:%.*]] = bitcast [10 x i32]* [[TMP2]] to i8*
+// CHECK3-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 [[TMP11]], i8* align 4 [[TMP12]], i32 40, i1 false)
+// CHECK3-NEXT: [[TMP13:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK3-NEXT: [[TMP14:%.*]] = load i32, i32* [[TMP13]], align 4
+// CHECK3-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP14]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK3-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK3-NEXT: store i32 [[TMP15]], i32* [[DOTOMP_IV]], align 4
+// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK3: omp.inner.for.cond:
+// CHECK3-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK3-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK3-NEXT: [[CMP6:%.*]] = icmp ule i32 [[TMP16]], [[TMP17]]
+// CHECK3-NEXT: br i1 [[CMP6]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK3: omp.inner.for.body:
+// CHECK3-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK3-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP18]], 1
+// CHECK3-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK3-NEXT: store i32 [[ADD]], i32* [[I5]], align 4
+// CHECK3-NEXT: [[CALL:%.*]] = call i32 @_Z3fooPi(i32* [[I5]]) #[[ATTR5:[0-9]+]]
+// CHECK3-NEXT: [[CALL7:%.*]] = call i32 @_Z3fooPi(i32* [[TMP1]]) #[[ATTR5]]
+// CHECK3-NEXT: [[ADD8:%.*]] = add nsw i32 [[CALL]], [[CALL7]]
+// CHECK3-NEXT: [[TMP19:%.*]] = load i32, i32* [[I5]], align 4
+// CHECK3-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[B3]], i32 0, i32 [[TMP19]]
+// CHECK3-NEXT: [[CALL9:%.*]] = call i32 @_Z3fooPi(i32* [[ARRAYIDX]]) #[[ATTR5]]
+// CHECK3-NEXT: [[ADD10:%.*]] = add nsw i32 [[ADD8]], [[CALL9]]
+// CHECK3-NEXT: [[TMP20:%.*]] = load i32, i32* [[I5]], align 4
+// CHECK3-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[C4]], i32 0, i32 [[TMP20]]
+// CHECK3-NEXT: [[CALL12:%.*]] = call i32 @_Z3fooPi(i32* [[ARRAYIDX11]]) #[[ATTR5]]
+// CHECK3-NEXT: [[ADD13:%.*]] = add nsw i32 [[ADD10]], [[CALL12]]
+// CHECK3-NEXT: [[TMP21:%.*]] = load i32, i32* [[I5]], align 4
+// CHECK3-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP4]], i32 0, i32 [[TMP21]]
+// CHECK3-NEXT: [[CALL15:%.*]] = call i32 @_Z3fooPi(i32* [[ARRAYIDX14]]) #[[ATTR5]]
+// CHECK3-NEXT: [[ADD16:%.*]] = add nsw i32 [[ADD13]], [[CALL15]]
+// CHECK3-NEXT: store i32 [[ADD16]], i32* [[TMP1]], align 4
+// CHECK3-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK3: omp.body.continue:
+// CHECK3-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK3: omp.inner.for.inc:
+// CHECK3-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK3-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK3-NEXT: [[ADD17:%.*]] = add nsw i32 [[TMP22]], [[TMP23]]
+// CHECK3-NEXT: store i32 [[ADD17]], i32* [[DOTOMP_IV]], align 4
+// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK3: omp.inner.for.end:
+// CHECK3-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK3: omp.loop.exit:
+// CHECK3-NEXT: [[TMP24:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK3-NEXT: [[TMP25:%.*]] = load i32, i32* [[TMP24]], align 4
+// CHECK3-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP25]])
+// CHECK3-NEXT: [[TMP26:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK3-NEXT: [[TMP27:%.*]] = icmp ne i32 [[TMP26]], 0
+// CHECK3-NEXT: br i1 [[TMP27]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]]
+// CHECK3: .omp.lastprivate.then:
+// CHECK3-NEXT: [[TMP28:%.*]] = bitcast [10 x i32]* [[TMP3]] to i8*
+// CHECK3-NEXT: [[TMP29:%.*]] = bitcast [10 x i32]* [[C4]] to i8*
+// CHECK3-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 [[TMP28]], i8* align 4 [[TMP29]], i32 40, i1 false)
+// CHECK3-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]]
+// CHECK3: .omp.lastprivate.done:
+// CHECK3-NEXT: br label [[OMP_PRECOND_END]]
+// CHECK3: omp.precond.end:
+// CHECK3-NEXT: ret void
+//
+//
+// CHECK4-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l31
+// CHECK4-SAME: ([10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[C:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i32 [[ARGC:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[D:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK4-NEXT: entry:
+// CHECK4-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4
+// CHECK4-NEXT: [[C_ADDR:%.*]] = alloca [10 x i32]*, align 4
+// CHECK4-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 4
+// CHECK4-NEXT: [[ARGC_ADDR:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[D_ADDR:%.*]] = alloca [10 x i32]*, align 4
+// CHECK4-NEXT: [[ARGC_CASTED:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK4-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4
+// CHECK4-NEXT: store [10 x i32]* [[C]], [10 x i32]** [[C_ADDR]], align 4
+// CHECK4-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 4
+// CHECK4-NEXT: store i32 [[ARGC]], i32* [[ARGC_ADDR]], align 4
+// CHECK4-NEXT: store [10 x i32]* [[D]], [10 x i32]** [[D_ADDR]], align 4
+// CHECK4-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4
+// CHECK4-NEXT: [[TMP1:%.*]] = load [10 x i32]*, [10 x i32]** [[C_ADDR]], align 4
+// CHECK4-NEXT: [[TMP2:%.*]] = load i32*, i32** [[A_ADDR]], align 4
+// CHECK4-NEXT: [[TMP3:%.*]] = load [10 x i32]*, [10 x i32]** [[D_ADDR]], align 4
+// CHECK4-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK4-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0)
+// CHECK4-NEXT: br label [[DOTEXECUTE:%.*]]
+// CHECK4: .execute:
+// CHECK4-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3:[0-9]+]])
+// CHECK4-NEXT: [[TMP5:%.*]] = load i32, i32* [[ARGC_ADDR]], align 4
+// CHECK4-NEXT: store i32 [[TMP5]], i32* [[ARGC_CASTED]], align 4
+// CHECK4-NEXT: [[TMP6:%.*]] = load i32, i32* [[ARGC_CASTED]], align 4
+// CHECK4-NEXT: store i32 [[TMP4]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK4-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x i32]* [[TMP0]], [10 x i32]* [[TMP1]], i32* [[TMP2]], i32 [[TMP6]], [10 x i32]* [[TMP3]]) #[[ATTR4:[0-9]+]]
+// CHECK4-NEXT: br label [[DOTOMP_DEINIT:%.*]]
+// CHECK4: .omp.deinit:
+// CHECK4-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0)
+// CHECK4-NEXT: br label [[DOTEXIT:%.*]]
+// CHECK4: .exit:
+// CHECK4-NEXT: ret void
+//
+//
+// CHECK4-LABEL: define {{[^@]+}}@__omp_outlined__
+// CHECK4-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[C:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i32 [[ARGC:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[D:%.*]]) #[[ATTR0]] {
+// CHECK4-NEXT: entry:
+// CHECK4-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK4-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK4-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4
+// CHECK4-NEXT: [[C_ADDR:%.*]] = alloca [10 x i32]*, align 4
+// CHECK4-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 4
+// CHECK4-NEXT: [[ARGC_ADDR:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[D_ADDR:%.*]] = alloca [10 x i32]*, align 4
+// CHECK4-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[B4:%.*]] = alloca [10 x i32], align 4
+// CHECK4-NEXT: [[I5:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [7 x i8*], align 4
+// CHECK4-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK4-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK4-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4
+// CHECK4-NEXT: store [10 x i32]* [[C]], [10 x i32]** [[C_ADDR]], align 4
+// CHECK4-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 4
+// CHECK4-NEXT: store i32 [[ARGC]], i32* [[ARGC_ADDR]], align 4
+// CHECK4-NEXT: store [10 x i32]* [[D]], [10 x i32]** [[D_ADDR]], align 4
+// CHECK4-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4
+// CHECK4-NEXT: [[TMP1:%.*]] = load [10 x i32]*, [10 x i32]** [[C_ADDR]], align 4
+// CHECK4-NEXT: [[TMP2:%.*]] = load i32*, i32** [[A_ADDR]], align 4
+// CHECK4-NEXT: [[TMP3:%.*]] = load [10 x i32]*, [10 x i32]** [[D_ADDR]], align 4
+// CHECK4-NEXT: [[TMP4:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared", align 2
+// CHECK4-NEXT: [[TMP5:%.*]] = load i32, i32* @"_openmp_static_kernel$size", align 4
+// CHECK4-NEXT: call void @__kmpc_get_team_static_memory(i16 1, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds (%"union._shared_openmp_static_memory_type_$_", %"union._shared_openmp_static_memory_type_$_" addrspace(3)* @"_openmp_shared_static_glob_rd_$_", i32 0, i32 0, i32 0) to i8*), i32 [[TMP5]], i16 [[TMP4]], i8** addrspacecast (i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr" to i8**))
+// CHECK4-NEXT: [[TMP6:%.*]] = load i8*, i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr", align 4
+// CHECK4-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, i8* [[TMP6]], i32 0
+// CHECK4-NEXT: [[TMP8:%.*]] = bitcast i8* [[TMP7]] to %struct._globalized_locals_ty*
+// CHECK4-NEXT: [[C1:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP8]], i32 0, i32 0
+// CHECK4-NEXT: [[TMP9:%.*]] = load i32, i32* [[ARGC_ADDR]], align 4
+// CHECK4-NEXT: store i32 [[TMP9]], i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK4-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK4-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP10]], 0
+// CHECK4-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK4-NEXT: [[SUB3:%.*]] = sub nsw i32 [[DIV]], 1
+// CHECK4-NEXT: store i32 [[SUB3]], i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK4-NEXT: store i32 0, i32* [[I]], align 4
+// CHECK4-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK4-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP11]]
+// CHECK4-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]]
+// CHECK4: omp.precond.then:
+// CHECK4-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK4-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK4-NEXT: store i32 [[TMP12]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK4-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK4-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK4-NEXT: [[TMP13:%.*]] = bitcast [10 x i32]* [[B4]] to i8*
+// CHECK4-NEXT: [[TMP14:%.*]] = bitcast [10 x i32]* [[TMP0]] to i8*
+// CHECK4-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 [[TMP13]], i8* align 4 [[TMP14]], i32 40, i1 false)
+// CHECK4-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK4-NEXT: [[TMP15:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK4-NEXT: [[TMP16:%.*]] = load i32, i32* [[TMP15]], align 4
+// CHECK4-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1:[0-9]+]], i32 [[TMP16]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]])
+// CHECK4-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK4-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK4-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[TMP17]], [[TMP18]]
+// CHECK4-NEXT: br i1 [[CMP6]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK4: cond.true:
+// CHECK4-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK4-NEXT: br label [[COND_END:%.*]]
+// CHECK4: cond.false:
+// CHECK4-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK4-NEXT: br label [[COND_END]]
+// CHECK4: cond.end:
+// CHECK4-NEXT: [[COND:%.*]] = phi i32 [ [[TMP19]], [[COND_TRUE]] ], [ [[TMP20]], [[COND_FALSE]] ]
+// CHECK4-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK4-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK4-NEXT: store i32 [[TMP21]], i32* [[DOTOMP_IV]], align 4
+// CHECK4-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK4: omp.inner.for.cond:
+// CHECK4-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK4-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK4-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP23]], 1
+// CHECK4-NEXT: [[CMP7:%.*]] = icmp slt i32 [[TMP22]], [[ADD]]
+// CHECK4-NEXT: br i1 [[CMP7]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK4: omp.inner.for.body:
+// CHECK4-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK4-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK4-NEXT: [[TMP26:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0
+// CHECK4-NEXT: [[TMP27:%.*]] = inttoptr i32 [[TMP24]] to i8*
+// CHECK4-NEXT: store i8* [[TMP27]], i8** [[TMP26]], align 4
+// CHECK4-NEXT: [[TMP28:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1
+// CHECK4-NEXT: [[TMP29:%.*]] = inttoptr i32 [[TMP25]] to i8*
+// CHECK4-NEXT: store i8* [[TMP29]], i8** [[TMP28]], align 4
+// CHECK4-NEXT: [[TMP30:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2
+// CHECK4-NEXT: [[TMP31:%.*]] = bitcast i32* [[ARGC_ADDR]] to i8*
+// CHECK4-NEXT: store i8* [[TMP31]], i8** [[TMP30]], align 4
+// CHECK4-NEXT: [[TMP32:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3
+// CHECK4-NEXT: [[TMP33:%.*]] = bitcast i32* [[TMP2]] to i8*
+// CHECK4-NEXT: store i8* [[TMP33]], i8** [[TMP32]], align 4
+// CHECK4-NEXT: [[TMP34:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 4
+// CHECK4-NEXT: [[TMP35:%.*]] = bitcast [10 x i32]* [[B4]] to i8*
+// CHECK4-NEXT: store i8* [[TMP35]], i8** [[TMP34]], align 4
+// CHECK4-NEXT: [[TMP36:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 5
+// CHECK4-NEXT: [[TMP37:%.*]] = bitcast [10 x i32]* [[C1]] to i8*
+// CHECK4-NEXT: store i8* [[TMP37]], i8** [[TMP36]], align 4
+// CHECK4-NEXT: [[TMP38:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 6
+// CHECK4-NEXT: [[TMP39:%.*]] = bitcast [10 x i32]* [[TMP3]] to i8*
+// CHECK4-NEXT: store i8* [[TMP39]], i8** [[TMP38]], align 4
+// CHECK4-NEXT: [[TMP40:%.*]] = load i32, i32* [[TMP2]], align 4
+// CHECK4-NEXT: [[TOBOOL:%.*]] = icmp ne i32 [[TMP40]], 0
+// CHECK4-NEXT: [[TMP41:%.*]] = zext i1 [[TOBOOL]] to i32
+// CHECK4-NEXT: [[TMP42:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK4-NEXT: [[TMP43:%.*]] = load i32, i32* [[TMP42]], align 4
+// CHECK4-NEXT: [[TMP44:%.*]] = bitcast [7 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK4-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP43]], i32 [[TMP41]], i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, i32*, i32*, [10 x i32]*, [10 x i32]*, [10 x i32]*)* @__omp_outlined__1 to i8*), i8* null, i8** [[TMP44]], i32 7)
+// CHECK4-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK4: omp.inner.for.inc:
+// CHECK4-NEXT: [[TMP45:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK4-NEXT: [[TMP46:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK4-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP45]], [[TMP46]]
+// CHECK4-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_IV]], align 4
+// CHECK4-NEXT: [[TMP47:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK4-NEXT: [[TMP48:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK4-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP47]], [[TMP48]]
+// CHECK4-NEXT: store i32 [[ADD9]], i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK4-NEXT: [[TMP49:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK4-NEXT: [[TMP50:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK4-NEXT: [[ADD10:%.*]] = add nsw i32 [[TMP49]], [[TMP50]]
+// CHECK4-NEXT: store i32 [[ADD10]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK4-NEXT: [[TMP51:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK4-NEXT: [[TMP52:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK4-NEXT: [[CMP11:%.*]] = icmp sgt i32 [[TMP51]], [[TMP52]]
+// CHECK4-NEXT: br i1 [[CMP11]], label [[COND_TRUE12:%.*]], label [[COND_FALSE13:%.*]]
+// CHECK4: cond.true12:
+// CHECK4-NEXT: [[TMP53:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK4-NEXT: br label [[COND_END14:%.*]]
+// CHECK4: cond.false13:
+// CHECK4-NEXT: [[TMP54:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK4-NEXT: br label [[COND_END14]]
+// CHECK4: cond.end14:
+// CHECK4-NEXT: [[COND15:%.*]] = phi i32 [ [[TMP53]], [[COND_TRUE12]] ], [ [[TMP54]], [[COND_FALSE13]] ]
+// CHECK4-NEXT: store i32 [[COND15]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK4-NEXT: [[TMP55:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK4-NEXT: store i32 [[TMP55]], i32* [[DOTOMP_IV]], align 4
+// CHECK4-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK4: omp.inner.for.end:
+// CHECK4-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK4: omp.loop.exit:
+// CHECK4-NEXT: [[TMP56:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK4-NEXT: [[TMP57:%.*]] = load i32, i32* [[TMP56]], align 4
+// CHECK4-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP57]])
+// CHECK4-NEXT: [[TMP58:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK4-NEXT: [[TMP59:%.*]] = icmp ne i32 [[TMP58]], 0
+// CHECK4-NEXT: br i1 [[TMP59]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]]
+// CHECK4: .omp.lastprivate.then:
+// CHECK4-NEXT: [[TMP60:%.*]] = bitcast [10 x i32]* [[TMP1]] to i8*
+// CHECK4-NEXT: [[TMP61:%.*]] = bitcast [10 x i32]* [[C1]] to i8*
+// CHECK4-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 [[TMP60]], i8* align 4 [[TMP61]], i32 40, i1 false)
+// CHECK4-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]]
+// CHECK4: .omp.lastprivate.done:
+// CHECK4-NEXT: br label [[OMP_PRECOND_END]]
+// CHECK4: omp.precond.end:
+// CHECK4-NEXT: [[TMP62:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared", align 2
+// CHECK4-NEXT: call void @__kmpc_restore_team_static_memory(i16 1, i16 [[TMP62]])
+// CHECK4-NEXT: ret void
+//
+//
+// CHECK4-LABEL: define {{[^@]+}}@__omp_outlined__1
+// CHECK4-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32* nonnull align 4 dereferenceable(4) [[ARGC:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[C:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[D:%.*]]) #[[ATTR0]] {
+// CHECK4-NEXT: entry:
+// CHECK4-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK4-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK4-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[ARGC_ADDR:%.*]] = alloca i32*, align 4
+// CHECK4-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 4
+// CHECK4-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4
+// CHECK4-NEXT: [[C_ADDR:%.*]] = alloca [10 x i32]*, align 4
+// CHECK4-NEXT: [[D_ADDR:%.*]] = alloca [10 x i32]*, align 4
+// CHECK4-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[B3:%.*]] = alloca [10 x i32], align 4
+// CHECK4-NEXT: [[C4:%.*]] = alloca [10 x i32], align 4
+// CHECK4-NEXT: [[I5:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK4-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK4-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK4-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK4-NEXT: store i32* [[ARGC]], i32** [[ARGC_ADDR]], align 4
+// CHECK4-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 4
+// CHECK4-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4
+// CHECK4-NEXT: store [10 x i32]* [[C]], [10 x i32]** [[C_ADDR]], align 4
+// CHECK4-NEXT: store [10 x i32]* [[D]], [10 x i32]** [[D_ADDR]], align 4
+// CHECK4-NEXT: [[TMP0:%.*]] = load i32*, i32** [[ARGC_ADDR]], align 4
+// CHECK4-NEXT: [[TMP1:%.*]] = load i32*, i32** [[A_ADDR]], align 4
+// CHECK4-NEXT: [[TMP2:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4
+// CHECK4-NEXT: [[TMP3:%.*]] = load [10 x i32]*, [10 x i32]** [[C_ADDR]], align 4
+// CHECK4-NEXT: [[TMP4:%.*]] = load [10 x i32]*, [10 x i32]** [[D_ADDR]], align 4
+// CHECK4-NEXT: [[TMP5:%.*]] = load i32, i32* [[TMP0]], align 4
+// CHECK4-NEXT: store i32 [[TMP5]], i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK4-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK4-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP6]], 0
+// CHECK4-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK4-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1
+// CHECK4-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK4-NEXT: store i32 0, i32* [[I]], align 4
+// CHECK4-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK4-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP7]]
+// CHECK4-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]]
+// CHECK4: omp.precond.then:
+// CHECK4-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK4-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK4-NEXT: store i32 [[TMP8]], i32* [[DOTOMP_UB]], align 4
+// CHECK4-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK4-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK4-NEXT: store i32 [[TMP9]], i32* [[DOTOMP_LB]], align 4
+// CHECK4-NEXT: store i32 [[TMP10]], i32* [[DOTOMP_UB]], align 4
+// CHECK4-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK4-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK4-NEXT: [[TMP11:%.*]] = bitcast [10 x i32]* [[B3]] to i8*
+// CHECK4-NEXT: [[TMP12:%.*]] = bitcast [10 x i32]* [[TMP2]] to i8*
+// CHECK4-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 [[TMP11]], i8* align 4 [[TMP12]], i32 40, i1 false)
+// CHECK4-NEXT: [[TMP13:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK4-NEXT: [[TMP14:%.*]] = load i32, i32* [[TMP13]], align 4
+// CHECK4-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP14]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK4-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK4-NEXT: store i32 [[TMP15]], i32* [[DOTOMP_IV]], align 4
+// CHECK4-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK4: omp.inner.for.cond:
+// CHECK4-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK4-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK4-NEXT: [[CMP6:%.*]] = icmp ule i32 [[TMP16]], [[TMP17]]
+// CHECK4-NEXT: br i1 [[CMP6]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK4: omp.inner.for.body:
+// CHECK4-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK4-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP18]], 1
+// CHECK4-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK4-NEXT: store i32 [[ADD]], i32* [[I5]], align 4
+// CHECK4-NEXT: [[CALL:%.*]] = call i32 @_Z3fooPi(i32* [[I5]]) #[[ATTR5:[0-9]+]]
+// CHECK4-NEXT: [[CALL7:%.*]] = call i32 @_Z3fooPi(i32* [[TMP1]]) #[[ATTR5]]
+// CHECK4-NEXT: [[ADD8:%.*]] = add nsw i32 [[CALL]], [[CALL7]]
+// CHECK4-NEXT: [[TMP19:%.*]] = load i32, i32* [[I5]], align 4
+// CHECK4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[B3]], i32 0, i32 [[TMP19]]
+// CHECK4-NEXT: [[CALL9:%.*]] = call i32 @_Z3fooPi(i32* [[ARRAYIDX]]) #[[ATTR5]]
+// CHECK4-NEXT: [[ADD10:%.*]] = add nsw i32 [[ADD8]], [[CALL9]]
+// CHECK4-NEXT: [[TMP20:%.*]] = load i32, i32* [[I5]], align 4
+// CHECK4-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[C4]], i32 0, i32 [[TMP20]]
+// CHECK4-NEXT: [[CALL12:%.*]] = call i32 @_Z3fooPi(i32* [[ARRAYIDX11]]) #[[ATTR5]]
+// CHECK4-NEXT: [[ADD13:%.*]] = add nsw i32 [[ADD10]], [[CALL12]]
+// CHECK4-NEXT: [[TMP21:%.*]] = load i32, i32* [[I5]], align 4
+// CHECK4-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP4]], i32 0, i32 [[TMP21]]
+// CHECK4-NEXT: [[CALL15:%.*]] = call i32 @_Z3fooPi(i32* [[ARRAYIDX14]]) #[[ATTR5]]
+// CHECK4-NEXT: [[ADD16:%.*]] = add nsw i32 [[ADD13]], [[CALL15]]
+// CHECK4-NEXT: store i32 [[ADD16]], i32* [[TMP1]], align 4
+// CHECK4-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK4: omp.body.continue:
+// CHECK4-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK4: omp.inner.for.inc:
+// CHECK4-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK4-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK4-NEXT: [[ADD17:%.*]] = add nsw i32 [[TMP22]], [[TMP23]]
+// CHECK4-NEXT: store i32 [[ADD17]], i32* [[DOTOMP_IV]], align 4
+// CHECK4-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK4: omp.inner.for.end:
+// CHECK4-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK4: omp.loop.exit:
+// CHECK4-NEXT: [[TMP24:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK4-NEXT: [[TMP25:%.*]] = load i32, i32* [[TMP24]], align 4
+// CHECK4-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP25]])
+// CHECK4-NEXT: [[TMP26:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK4-NEXT: [[TMP27:%.*]] = icmp ne i32 [[TMP26]], 0
+// CHECK4-NEXT: br i1 [[TMP27]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]]
+// CHECK4: .omp.lastprivate.then:
+// CHECK4-NEXT: [[TMP28:%.*]] = bitcast [10 x i32]* [[TMP3]] to i8*
+// CHECK4-NEXT: [[TMP29:%.*]] = bitcast [10 x i32]* [[C4]] to i8*
+// CHECK4-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 [[TMP28]], i8* align 4 [[TMP29]], i32 40, i1 false)
+// CHECK4-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]]
+// CHECK4: .omp.lastprivate.done:
+// CHECK4-NEXT: br label [[OMP_PRECOND_END]]
+// CHECK4: omp.precond.end:
+// CHECK4-NEXT: ret void
+//
+//
+// CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l31
+// CHECK5-SAME: ([10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[C:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i32 [[ARGC:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[D:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK5-NEXT: entry:
+// CHECK5-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4
+// CHECK5-NEXT: [[C_ADDR:%.*]] = alloca [10 x i32]*, align 4
+// CHECK5-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 4
+// CHECK5-NEXT: [[ARGC_ADDR:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[D_ADDR:%.*]] = alloca [10 x i32]*, align 4
+// CHECK5-NEXT: [[ARGC_CASTED:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK5-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4
+// CHECK5-NEXT: store [10 x i32]* [[C]], [10 x i32]** [[C_ADDR]], align 4
+// CHECK5-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 4
+// CHECK5-NEXT: store i32 [[ARGC]], i32* [[ARGC_ADDR]], align 4
+// CHECK5-NEXT: store [10 x i32]* [[D]], [10 x i32]** [[D_ADDR]], align 4
+// CHECK5-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4
+// CHECK5-NEXT: [[TMP1:%.*]] = load [10 x i32]*, [10 x i32]** [[C_ADDR]], align 4
+// CHECK5-NEXT: [[TMP2:%.*]] = load i32*, i32** [[A_ADDR]], align 4
+// CHECK5-NEXT: [[TMP3:%.*]] = load [10 x i32]*, [10 x i32]** [[D_ADDR]], align 4
+// CHECK5-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK5-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0)
+// CHECK5-NEXT: br label [[DOTEXECUTE:%.*]]
+// CHECK5: .execute:
+// CHECK5-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3:[0-9]+]])
+// CHECK5-NEXT: [[TMP5:%.*]] = load i32, i32* [[ARGC_ADDR]], align 4
+// CHECK5-NEXT: store i32 [[TMP5]], i32* [[ARGC_CASTED]], align 4
+// CHECK5-NEXT: [[TMP6:%.*]] = load i32, i32* [[ARGC_CASTED]], align 4
+// CHECK5-NEXT: store i32 [[TMP4]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK5-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x i32]* [[TMP0]], [10 x i32]* [[TMP1]], i32* [[TMP2]], i32 [[TMP6]], [10 x i32]* [[TMP3]]) #[[ATTR4:[0-9]+]]
+// CHECK5-NEXT: br label [[DOTOMP_DEINIT:%.*]]
+// CHECK5: .omp.deinit:
+// CHECK5-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0)
+// CHECK5-NEXT: br label [[DOTEXIT:%.*]]
+// CHECK5: .exit:
+// CHECK5-NEXT: ret void
+//
+//
+// CHECK5-LABEL: define {{[^@]+}}@__omp_outlined__
+// CHECK5-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[C:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i32 [[ARGC:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[D:%.*]]) #[[ATTR0]] {
+// CHECK5-NEXT: entry:
+// CHECK5-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK5-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK5-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4
+// CHECK5-NEXT: [[C_ADDR:%.*]] = alloca [10 x i32]*, align 4
+// CHECK5-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 4
+// CHECK5-NEXT: [[ARGC_ADDR:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[D_ADDR:%.*]] = alloca [10 x i32]*, align 4
+// CHECK5-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[B4:%.*]] = alloca [10 x i32], align 4
+// CHECK5-NEXT: [[I5:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [7 x i8*], align 4
+// CHECK5-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK5-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK5-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4
+// CHECK5-NEXT: store [10 x i32]* [[C]], [10 x i32]** [[C_ADDR]], align 4
+// CHECK5-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 4
+// CHECK5-NEXT: store i32 [[ARGC]], i32* [[ARGC_ADDR]], align 4
+// CHECK5-NEXT: store [10 x i32]* [[D]], [10 x i32]** [[D_ADDR]], align 4
+// CHECK5-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4
+// CHECK5-NEXT: [[TMP1:%.*]] = load [10 x i32]*, [10 x i32]** [[C_ADDR]], align 4
+// CHECK5-NEXT: [[TMP2:%.*]] = load i32*, i32** [[A_ADDR]], align 4
+// CHECK5-NEXT: [[TMP3:%.*]] = load [10 x i32]*, [10 x i32]** [[D_ADDR]], align 4
+// CHECK5-NEXT: [[TMP4:%.*]] = call i8* @__kmpc_data_sharing_push_stack(i32 40, i16 1)
+// CHECK5-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to %struct._globalized_locals_ty*
+// CHECK5-NEXT: [[C1:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP5]], i32 0, i32 0
+// CHECK5-NEXT: [[TMP6:%.*]] = load i32, i32* [[ARGC_ADDR]], align 4
+// CHECK5-NEXT: store i32 [[TMP6]], i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK5-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK5-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP7]], 0
+// CHECK5-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK5-NEXT: [[SUB3:%.*]] = sub nsw i32 [[DIV]], 1
+// CHECK5-NEXT: store i32 [[SUB3]], i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK5-NEXT: store i32 0, i32* [[I]], align 4
+// CHECK5-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK5-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP8]]
+// CHECK5-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]]
+// CHECK5: omp.precond.then:
+// CHECK5-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK5-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK5-NEXT: store i32 [[TMP9]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK5-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK5-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK5-NEXT: [[TMP10:%.*]] = bitcast [10 x i32]* [[B4]] to i8*
+// CHECK5-NEXT: [[TMP11:%.*]] = bitcast [10 x i32]* [[TMP0]] to i8*
+// CHECK5-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 [[TMP10]], i8* align 4 [[TMP11]], i32 40, i1 false)
+// CHECK5-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK5-NEXT: [[TMP12:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK5-NEXT: [[TMP13:%.*]] = load i32, i32* [[TMP12]], align 4
+// CHECK5-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1:[0-9]+]], i32 [[TMP13]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]])
+// CHECK5-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK5-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK5-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[TMP14]], [[TMP15]]
+// CHECK5-NEXT: br i1 [[CMP6]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK5: cond.true:
+// CHECK5-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK5-NEXT: br label [[COND_END:%.*]]
+// CHECK5: cond.false:
+// CHECK5-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK5-NEXT: br label [[COND_END]]
+// CHECK5: cond.end:
+// CHECK5-NEXT: [[COND:%.*]] = phi i32 [ [[TMP16]], [[COND_TRUE]] ], [ [[TMP17]], [[COND_FALSE]] ]
+// CHECK5-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK5-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK5-NEXT: store i32 [[TMP18]], i32* [[DOTOMP_IV]], align 4
+// CHECK5-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK5: omp.inner.for.cond:
+// CHECK5-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK5-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK5-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP20]], 1
+// CHECK5-NEXT: [[CMP7:%.*]] = icmp slt i32 [[TMP19]], [[ADD]]
+// CHECK5-NEXT: br i1 [[CMP7]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK5: omp.inner.for.body:
+// CHECK5-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK5-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK5-NEXT: [[TMP23:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0
+// CHECK5-NEXT: [[TMP24:%.*]] = inttoptr i32 [[TMP21]] to i8*
+// CHECK5-NEXT: store i8* [[TMP24]], i8** [[TMP23]], align 4
+// CHECK5-NEXT: [[TMP25:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1
+// CHECK5-NEXT: [[TMP26:%.*]] = inttoptr i32 [[TMP22]] to i8*
+// CHECK5-NEXT: store i8* [[TMP26]], i8** [[TMP25]], align 4
+// CHECK5-NEXT: [[TMP27:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2
+// CHECK5-NEXT: [[TMP28:%.*]] = bitcast i32* [[ARGC_ADDR]] to i8*
+// CHECK5-NEXT: store i8* [[TMP28]], i8** [[TMP27]], align 4
+// CHECK5-NEXT: [[TMP29:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3
+// CHECK5-NEXT: [[TMP30:%.*]] = bitcast i32* [[TMP2]] to i8*
+// CHECK5-NEXT: store i8* [[TMP30]], i8** [[TMP29]], align 4
+// CHECK5-NEXT: [[TMP31:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 4
+// CHECK5-NEXT: [[TMP32:%.*]] = bitcast [10 x i32]* [[B4]] to i8*
+// CHECK5-NEXT: store i8* [[TMP32]], i8** [[TMP31]], align 4
+// CHECK5-NEXT: [[TMP33:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 5
+// CHECK5-NEXT: [[TMP34:%.*]] = bitcast [10 x i32]* [[C1]] to i8*
+// CHECK5-NEXT: store i8* [[TMP34]], i8** [[TMP33]], align 4
+// CHECK5-NEXT: [[TMP35:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 6
+// CHECK5-NEXT: [[TMP36:%.*]] = bitcast [10 x i32]* [[TMP3]] to i8*
+// CHECK5-NEXT: store i8* [[TMP36]], i8** [[TMP35]], align 4
+// CHECK5-NEXT: [[TMP37:%.*]] = load i32, i32* [[TMP2]], align 4
+// CHECK5-NEXT: [[TOBOOL:%.*]] = icmp ne i32 [[TMP37]], 0
+// CHECK5-NEXT: [[TMP38:%.*]] = zext i1 [[TOBOOL]] to i32
+// CHECK5-NEXT: [[TMP39:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK5-NEXT: [[TMP40:%.*]] = load i32, i32* [[TMP39]], align 4
+// CHECK5-NEXT: [[TMP41:%.*]] = bitcast [7 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK5-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP40]], i32 [[TMP38]], i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, i32*, i32*, [10 x i32]*, [10 x i32]*, [10 x i32]*)* @__omp_outlined__1 to i8*), i8* null, i8** [[TMP41]], i32 7)
+// CHECK5-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK5: omp.inner.for.inc:
+// CHECK5-NEXT: [[TMP42:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK5-NEXT: [[TMP43:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK5-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP42]], [[TMP43]]
+// CHECK5-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_IV]], align 4
+// CHECK5-NEXT: [[TMP44:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK5-NEXT: [[TMP45:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK5-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP44]], [[TMP45]]
+// CHECK5-NEXT: store i32 [[ADD9]], i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK5-NEXT: [[TMP46:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK5-NEXT: [[TMP47:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK5-NEXT: [[ADD10:%.*]] = add nsw i32 [[TMP46]], [[TMP47]]
+// CHECK5-NEXT: store i32 [[ADD10]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK5-NEXT: [[TMP48:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK5-NEXT: [[TMP49:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK5-NEXT: [[CMP11:%.*]] = icmp sgt i32 [[TMP48]], [[TMP49]]
+// CHECK5-NEXT: br i1 [[CMP11]], label [[COND_TRUE12:%.*]], label [[COND_FALSE13:%.*]]
+// CHECK5: cond.true12:
+// CHECK5-NEXT: [[TMP50:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK5-NEXT: br label [[COND_END14:%.*]]
+// CHECK5: cond.false13:
+// CHECK5-NEXT: [[TMP51:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK5-NEXT: br label [[COND_END14]]
+// CHECK5: cond.end14:
+// CHECK5-NEXT: [[COND15:%.*]] = phi i32 [ [[TMP50]], [[COND_TRUE12]] ], [ [[TMP51]], [[COND_FALSE13]] ]
+// CHECK5-NEXT: store i32 [[COND15]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK5-NEXT: [[TMP52:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK5-NEXT: store i32 [[TMP52]], i32* [[DOTOMP_IV]], align 4
+// CHECK5-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK5: omp.inner.for.end:
+// CHECK5-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK5: omp.loop.exit:
+// CHECK5-NEXT: [[TMP53:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK5-NEXT: [[TMP54:%.*]] = load i32, i32* [[TMP53]], align 4
+// CHECK5-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP54]])
+// CHECK5-NEXT: [[TMP55:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK5-NEXT: [[TMP56:%.*]] = icmp ne i32 [[TMP55]], 0
+// CHECK5-NEXT: br i1 [[TMP56]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]]
+// CHECK5: .omp.lastprivate.then:
+// CHECK5-NEXT: [[TMP57:%.*]] = bitcast [10 x i32]* [[TMP1]] to i8*
+// CHECK5-NEXT: [[TMP58:%.*]] = bitcast [10 x i32]* [[C1]] to i8*
+// CHECK5-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 [[TMP57]], i8* align 4 [[TMP58]], i32 40, i1 false)
+// CHECK5-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]]
+// CHECK5: .omp.lastprivate.done:
+// CHECK5-NEXT: br label [[OMP_PRECOND_END]]
+// CHECK5: omp.precond.end:
+// CHECK5-NEXT: call void @__kmpc_data_sharing_pop_stack(i8* [[TMP4]])
+// CHECK5-NEXT: ret void
+//
+//
+// CHECK5-LABEL: define {{[^@]+}}@__omp_outlined__1
+// CHECK5-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32* nonnull align 4 dereferenceable(4) [[ARGC:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[C:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[D:%.*]]) #[[ATTR0]] {
+// CHECK5-NEXT: entry:
+// CHECK5-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK5-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK5-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[ARGC_ADDR:%.*]] = alloca i32*, align 4
+// CHECK5-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 4
+// CHECK5-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4
+// CHECK5-NEXT: [[C_ADDR:%.*]] = alloca [10 x i32]*, align 4
+// CHECK5-NEXT: [[D_ADDR:%.*]] = alloca [10 x i32]*, align 4
+// CHECK5-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[B3:%.*]] = alloca [10 x i32], align 4
+// CHECK5-NEXT: [[C4:%.*]] = alloca [10 x i32], align 4
+// CHECK5-NEXT: [[I5:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK5-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK5-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK5-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK5-NEXT: store i32* [[ARGC]], i32** [[ARGC_ADDR]], align 4
+// CHECK5-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 4
+// CHECK5-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4
+// CHECK5-NEXT: store [10 x i32]* [[C]], [10 x i32]** [[C_ADDR]], align 4
+// CHECK5-NEXT: store [10 x i32]* [[D]], [10 x i32]** [[D_ADDR]], align 4
+// CHECK5-NEXT: [[TMP0:%.*]] = load i32*, i32** [[ARGC_ADDR]], align 4
+// CHECK5-NEXT: [[TMP1:%.*]] = load i32*, i32** [[A_ADDR]], align 4
+// CHECK5-NEXT: [[TMP2:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4
+// CHECK5-NEXT: [[TMP3:%.*]] = load [10 x i32]*, [10 x i32]** [[C_ADDR]], align 4
+// CHECK5-NEXT: [[TMP4:%.*]] = load [10 x i32]*, [10 x i32]** [[D_ADDR]], align 4
+// CHECK5-NEXT: [[TMP5:%.*]] = load i32, i32* [[TMP0]], align 4
+// CHECK5-NEXT: store i32 [[TMP5]], i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK5-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK5-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP6]], 0
+// CHECK5-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK5-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1
+// CHECK5-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK5-NEXT: store i32 0, i32* [[I]], align 4
+// CHECK5-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK5-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP7]]
+// CHECK5-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]]
+// CHECK5: omp.precond.then:
+// CHECK5-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK5-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK5-NEXT: store i32 [[TMP8]], i32* [[DOTOMP_UB]], align 4
+// CHECK5-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK5-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK5-NEXT: store i32 [[TMP9]], i32* [[DOTOMP_LB]], align 4
+// CHECK5-NEXT: store i32 [[TMP10]], i32* [[DOTOMP_UB]], align 4
+// CHECK5-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK5-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK5-NEXT: [[TMP11:%.*]] = bitcast [10 x i32]* [[B3]] to i8*
+// CHECK5-NEXT: [[TMP12:%.*]] = bitcast [10 x i32]* [[TMP2]] to i8*
+// CHECK5-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 [[TMP11]], i8* align 4 [[TMP12]], i32 40, i1 false)
+// CHECK5-NEXT: [[TMP13:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK5-NEXT: [[TMP14:%.*]] = load i32, i32* [[TMP13]], align 4
+// CHECK5-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP14]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK5-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK5-NEXT: store i32 [[TMP15]], i32* [[DOTOMP_IV]], align 4
+// CHECK5-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK5: omp.inner.for.cond:
+// CHECK5-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK5-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK5-NEXT: [[CMP6:%.*]] = icmp ule i32 [[TMP16]], [[TMP17]]
+// CHECK5-NEXT: br i1 [[CMP6]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK5: omp.inner.for.body:
+// CHECK5-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK5-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP18]], 1
+// CHECK5-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK5-NEXT: store i32 [[ADD]], i32* [[I5]], align 4
+// CHECK5-NEXT: [[CALL:%.*]] = call i32 @_Z3fooPi(i32* [[I5]]) #[[ATTR5:[0-9]+]]
+// CHECK5-NEXT: [[CALL7:%.*]] = call i32 @_Z3fooPi(i32* [[TMP1]]) #[[ATTR5]]
+// CHECK5-NEXT: [[ADD8:%.*]] = add nsw i32 [[CALL]], [[CALL7]]
+// CHECK5-NEXT: [[TMP19:%.*]] = load i32, i32* [[I5]], align 4
+// CHECK5-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[B3]], i32 0, i32 [[TMP19]]
+// CHECK5-NEXT: [[CALL9:%.*]] = call i32 @_Z3fooPi(i32* [[ARRAYIDX]]) #[[ATTR5]]
+// CHECK5-NEXT: [[ADD10:%.*]] = add nsw i32 [[ADD8]], [[CALL9]]
+// CHECK5-NEXT: [[TMP20:%.*]] = load i32, i32* [[I5]], align 4
+// CHECK5-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[C4]], i32 0, i32 [[TMP20]]
+// CHECK5-NEXT: [[CALL12:%.*]] = call i32 @_Z3fooPi(i32* [[ARRAYIDX11]]) #[[ATTR5]]
+// CHECK5-NEXT: [[ADD13:%.*]] = add nsw i32 [[ADD10]], [[CALL12]]
+// CHECK5-NEXT: [[TMP21:%.*]] = load i32, i32* [[I5]], align 4
+// CHECK5-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP4]], i32 0, i32 [[TMP21]]
+// CHECK5-NEXT: [[CALL15:%.*]] = call i32 @_Z3fooPi(i32* [[ARRAYIDX14]]) #[[ATTR5]]
+// CHECK5-NEXT: [[ADD16:%.*]] = add nsw i32 [[ADD13]], [[CALL15]]
+// CHECK5-NEXT: store i32 [[ADD16]], i32* [[TMP1]], align 4
+// CHECK5-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK5: omp.body.continue:
+// CHECK5-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK5: omp.inner.for.inc:
+// CHECK5-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK5-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK5-NEXT: [[ADD17:%.*]] = add nsw i32 [[TMP22]], [[TMP23]]
+// CHECK5-NEXT: store i32 [[ADD17]], i32* [[DOTOMP_IV]], align 4
+// CHECK5-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK5: omp.inner.for.end:
+// CHECK5-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK5: omp.loop.exit:
+// CHECK5-NEXT: [[TMP24:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK5-NEXT: [[TMP25:%.*]] = load i32, i32* [[TMP24]], align 4
+// CHECK5-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP25]])
+// CHECK5-NEXT: [[TMP26:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK5-NEXT: [[TMP27:%.*]] = icmp ne i32 [[TMP26]], 0
+// CHECK5-NEXT: br i1 [[TMP27]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]]
+// CHECK5: .omp.lastprivate.then:
+// CHECK5-NEXT: [[TMP28:%.*]] = bitcast [10 x i32]* [[TMP3]] to i8*
+// CHECK5-NEXT: [[TMP29:%.*]] = bitcast [10 x i32]* [[C4]] to i8*
+// CHECK5-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 [[TMP28]], i8* align 4 [[TMP29]], i32 40, i1 false)
+// CHECK5-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]]
+// CHECK5: .omp.lastprivate.done:
+// CHECK5-NEXT: br label [[OMP_PRECOND_END]]
+// CHECK5: omp.precond.end:
+// CHECK5-NEXT: ret void
+//
+//
+// CHECK6-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l31
+// CHECK6-SAME: ([10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[C:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i32 [[ARGC:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[D:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK6-NEXT: entry:
+// CHECK6-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4
+// CHECK6-NEXT: [[C_ADDR:%.*]] = alloca [10 x i32]*, align 4
+// CHECK6-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 4
+// CHECK6-NEXT: [[ARGC_ADDR:%.*]] = alloca i32, align 4
+// CHECK6-NEXT: [[D_ADDR:%.*]] = alloca [10 x i32]*, align 4
+// CHECK6-NEXT: [[ARGC_CASTED:%.*]] = alloca i32, align 4
+// CHECK6-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK6-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK6-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK6-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4
+// CHECK6-NEXT: store [10 x i32]* [[C]], [10 x i32]** [[C_ADDR]], align 4
+// CHECK6-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 4
+// CHECK6-NEXT: store i32 [[ARGC]], i32* [[ARGC_ADDR]], align 4
+// CHECK6-NEXT: store [10 x i32]* [[D]], [10 x i32]** [[D_ADDR]], align 4
+// CHECK6-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4
+// CHECK6-NEXT: [[TMP1:%.*]] = load [10 x i32]*, [10 x i32]** [[C_ADDR]], align 4
+// CHECK6-NEXT: [[TMP2:%.*]] = load i32*, i32** [[A_ADDR]], align 4
+// CHECK6-NEXT: [[TMP3:%.*]] = load [10 x i32]*, [10 x i32]** [[D_ADDR]], align 4
+// CHECK6-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK6-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0)
+// CHECK6-NEXT: br label [[DOTEXECUTE:%.*]]
+// CHECK6: .execute:
+// CHECK6-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3:[0-9]+]])
+// CHECK6-NEXT: [[TMP5:%.*]] = load i32, i32* [[ARGC_ADDR]], align 4
+// CHECK6-NEXT: store i32 [[TMP5]], i32* [[ARGC_CASTED]], align 4
+// CHECK6-NEXT: [[TMP6:%.*]] = load i32, i32* [[ARGC_CASTED]], align 4
+// CHECK6-NEXT: store i32 [[TMP4]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK6-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x i32]* [[TMP0]], [10 x i32]* [[TMP1]], i32* [[TMP2]], i32 [[TMP6]], [10 x i32]* [[TMP3]]) #[[ATTR4:[0-9]+]]
+// CHECK6-NEXT: br label [[DOTOMP_DEINIT:%.*]]
+// CHECK6: .omp.deinit:
+// CHECK6-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0)
+// CHECK6-NEXT: br label [[DOTEXIT:%.*]]
+// CHECK6: .exit:
+// CHECK6-NEXT: ret void
+//
+//
+// CHECK6-LABEL: define {{[^@]+}}@__omp_outlined__
+// CHECK6-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[C:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i32 [[ARGC:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[D:%.*]]) #[[ATTR0]] {
+// CHECK6-NEXT: entry:
+// CHECK6-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK6-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK6-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4
+// CHECK6-NEXT: [[C_ADDR:%.*]] = alloca [10 x i32]*, align 4
+// CHECK6-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 4
+// CHECK6-NEXT: [[ARGC_ADDR:%.*]] = alloca i32, align 4
+// CHECK6-NEXT: [[D_ADDR:%.*]] = alloca [10 x i32]*, align 4
+// CHECK6-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK6-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK6-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK6-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4
+// CHECK6-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK6-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK6-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK6-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK6-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK6-NEXT: [[B4:%.*]] = alloca [10 x i32], align 4
+// CHECK6-NEXT: [[I5:%.*]] = alloca i32, align 4
+// CHECK6-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [7 x i8*], align 4
+// CHECK6-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK6-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK6-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4
+// CHECK6-NEXT: store [10 x i32]* [[C]], [10 x i32]** [[C_ADDR]], align 4
+// CHECK6-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 4
+// CHECK6-NEXT: store i32 [[ARGC]], i32* [[ARGC_ADDR]], align 4
+// CHECK6-NEXT: store [10 x i32]* [[D]], [10 x i32]** [[D_ADDR]], align 4
+// CHECK6-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4
+// CHECK6-NEXT: [[TMP1:%.*]] = load [10 x i32]*, [10 x i32]** [[C_ADDR]], align 4
+// CHECK6-NEXT: [[TMP2:%.*]] = load i32*, i32** [[A_ADDR]], align 4
+// CHECK6-NEXT: [[TMP3:%.*]] = load [10 x i32]*, [10 x i32]** [[D_ADDR]], align 4
+// CHECK6-NEXT: [[TMP4:%.*]] = call i8* @__kmpc_data_sharing_push_stack(i32 40, i16 1)
+// CHECK6-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to %struct._globalized_locals_ty*
+// CHECK6-NEXT: [[C1:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP5]], i32 0, i32 0
+// CHECK6-NEXT: [[TMP6:%.*]] = load i32, i32* [[ARGC_ADDR]], align 4
+// CHECK6-NEXT: store i32 [[TMP6]], i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK6-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK6-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP7]], 0
+// CHECK6-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK6-NEXT: [[SUB3:%.*]] = sub nsw i32 [[DIV]], 1
+// CHECK6-NEXT: store i32 [[SUB3]], i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK6-NEXT: store i32 0, i32* [[I]], align 4
+// CHECK6-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+//
CHECK6-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP8]] +// CHECK6-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK6: omp.precond.then: +// CHECK6-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK6-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK6-NEXT: store i32 [[TMP9]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK6-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK6-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK6-NEXT: [[TMP10:%.*]] = bitcast [10 x i32]* [[B4]] to i8* +// CHECK6-NEXT: [[TMP11:%.*]] = bitcast [10 x i32]* [[TMP0]] to i8* +// CHECK6-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 [[TMP10]], i8* align 4 [[TMP11]], i32 40, i1 false) +// CHECK6-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK6-NEXT: [[TMP12:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK6-NEXT: [[TMP13:%.*]] = load i32, i32* [[TMP12]], align 4 +// CHECK6-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1:[0-9]+]], i32 [[TMP13]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) +// CHECK6-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK6-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK6-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[TMP14]], [[TMP15]] +// CHECK6-NEXT: br i1 [[CMP6]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK6: cond.true: +// CHECK6-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK6-NEXT: br label [[COND_END:%.*]] +// CHECK6: cond.false: +// CHECK6-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK6-NEXT: br label [[COND_END]] +// CHECK6: cond.end: +// CHECK6-NEXT: [[COND:%.*]] = phi i32 [ [[TMP16]], [[COND_TRUE]] ], [ [[TMP17]], [[COND_FALSE]] ] +// CHECK6-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK6-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK6-NEXT: store i32 [[TMP18]], i32* [[DOTOMP_IV]], align 4 +// CHECK6-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK6: omp.inner.for.cond: +// CHECK6-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK6-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK6-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP20]], 1 +// CHECK6-NEXT: [[CMP7:%.*]] = icmp slt i32 [[TMP19]], [[ADD]] +// CHECK6-NEXT: br i1 [[CMP7]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK6: omp.inner.for.body: +// CHECK6-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK6-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK6-NEXT: [[TMP23:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 +// CHECK6-NEXT: [[TMP24:%.*]] = inttoptr i32 [[TMP21]] to i8* +// CHECK6-NEXT: store i8* [[TMP24]], i8** [[TMP23]], align 4 +// CHECK6-NEXT: [[TMP25:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 +// CHECK6-NEXT: [[TMP26:%.*]] = inttoptr i32 [[TMP22]] to i8* +// CHECK6-NEXT: store i8* [[TMP26]], i8** [[TMP25]], align 4 +// CHECK6-NEXT: [[TMP27:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 +// CHECK6-NEXT: [[TMP28:%.*]] = bitcast i32* [[ARGC_ADDR]] to i8* +// CHECK6-NEXT: store i8* [[TMP28]], i8** [[TMP27]], align 4 +// CHECK6-NEXT: 
[[TMP29:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3 +// CHECK6-NEXT: [[TMP30:%.*]] = bitcast i32* [[TMP2]] to i8* +// CHECK6-NEXT: store i8* [[TMP30]], i8** [[TMP29]], align 4 +// CHECK6-NEXT: [[TMP31:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 4 +// CHECK6-NEXT: [[TMP32:%.*]] = bitcast [10 x i32]* [[B4]] to i8* +// CHECK6-NEXT: store i8* [[TMP32]], i8** [[TMP31]], align 4 +// CHECK6-NEXT: [[TMP33:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 5 +// CHECK6-NEXT: [[TMP34:%.*]] = bitcast [10 x i32]* [[C1]] to i8* +// CHECK6-NEXT: store i8* [[TMP34]], i8** [[TMP33]], align 4 +// CHECK6-NEXT: [[TMP35:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 6 +// CHECK6-NEXT: [[TMP36:%.*]] = bitcast [10 x i32]* [[TMP3]] to i8* +// CHECK6-NEXT: store i8* [[TMP36]], i8** [[TMP35]], align 4 +// CHECK6-NEXT: [[TMP37:%.*]] = load i32, i32* [[TMP2]], align 4 +// CHECK6-NEXT: [[TOBOOL:%.*]] = icmp ne i32 [[TMP37]], 0 +// CHECK6-NEXT: [[TMP38:%.*]] = zext i1 [[TOBOOL]] to i32 +// CHECK6-NEXT: [[TMP39:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK6-NEXT: [[TMP40:%.*]] = load i32, i32* [[TMP39]], align 4 +// CHECK6-NEXT: [[TMP41:%.*]] = bitcast [7 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK6-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP40]], i32 [[TMP38]], i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, i32*, i32*, [10 x i32]*, [10 x i32]*, [10 x i32]*)* @__omp_outlined__1 to i8*), i8* null, i8** [[TMP41]], i32 7) +// CHECK6-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK6: omp.inner.for.inc: +// CHECK6-NEXT: [[TMP42:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK6-NEXT: [[TMP43:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK6-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP42]], [[TMP43]] +// CHECK6-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_IV]], align 4 +// CHECK6-NEXT: [[TMP44:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK6-NEXT: [[TMP45:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK6-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP44]], [[TMP45]] +// CHECK6-NEXT: store i32 [[ADD9]], i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK6-NEXT: [[TMP46:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK6-NEXT: [[TMP47:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK6-NEXT: [[ADD10:%.*]] = add nsw i32 [[TMP46]], [[TMP47]] +// CHECK6-NEXT: store i32 [[ADD10]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK6-NEXT: [[TMP48:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK6-NEXT: [[TMP49:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK6-NEXT: [[CMP11:%.*]] = icmp sgt i32 [[TMP48]], [[TMP49]] +// CHECK6-NEXT: br i1 [[CMP11]], label [[COND_TRUE12:%.*]], label [[COND_FALSE13:%.*]] +// CHECK6: cond.true12: +// CHECK6-NEXT: [[TMP50:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK6-NEXT: br label [[COND_END14:%.*]] +// CHECK6: cond.false13: +// CHECK6-NEXT: [[TMP51:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK6-NEXT: br label [[COND_END14]] +// CHECK6: cond.end14: +// CHECK6-NEXT: [[COND15:%.*]] = phi i32 [ [[TMP50]], [[COND_TRUE12]] ], [ [[TMP51]], [[COND_FALSE13]] ] +// CHECK6-NEXT: store i32 [[COND15]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK6-NEXT: [[TMP52:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK6-NEXT: store i32 [[TMP52]], i32* [[DOTOMP_IV]], align 4 +// CHECK6-NEXT: br 
label [[OMP_INNER_FOR_COND]] +// CHECK6: omp.inner.for.end: +// CHECK6-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK6: omp.loop.exit: +// CHECK6-NEXT: [[TMP53:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK6-NEXT: [[TMP54:%.*]] = load i32, i32* [[TMP53]], align 4 +// CHECK6-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP54]]) +// CHECK6-NEXT: [[TMP55:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK6-NEXT: [[TMP56:%.*]] = icmp ne i32 [[TMP55]], 0 +// CHECK6-NEXT: br i1 [[TMP56]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]] +// CHECK6: .omp.lastprivate.then: +// CHECK6-NEXT: [[TMP57:%.*]] = bitcast [10 x i32]* [[TMP1]] to i8* +// CHECK6-NEXT: [[TMP58:%.*]] = bitcast [10 x i32]* [[C1]] to i8* +// CHECK6-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 [[TMP57]], i8* align 4 [[TMP58]], i32 40, i1 false) +// CHECK6-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]] +// CHECK6: .omp.lastprivate.done: +// CHECK6-NEXT: br label [[OMP_PRECOND_END]] +// CHECK6: omp.precond.end: +// CHECK6-NEXT: call void @__kmpc_data_sharing_pop_stack(i8* [[TMP4]]) +// CHECK6-NEXT: ret void +// +// +// CHECK6-LABEL: define {{[^@]+}}@__omp_outlined__1 +// CHECK6-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32* nonnull align 4 dereferenceable(4) [[ARGC:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[C:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[D:%.*]]) #[[ATTR0]] { +// CHECK6-NEXT: entry: +// CHECK6-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK6-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK6-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[ARGC_ADDR:%.*]] = alloca i32*, align 4 +// CHECK6-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 4 +// CHECK6-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 +// CHECK6-NEXT: [[C_ADDR:%.*]] = alloca [10 x i32]*, align 4 +// CHECK6-NEXT: [[D_ADDR:%.*]] = alloca [10 x i32]*, align 4 +// CHECK6-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[B3:%.*]] = alloca [10 x i32], align 4 +// CHECK6-NEXT: [[C4:%.*]] = alloca [10 x i32], align 4 +// CHECK6-NEXT: [[I5:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK6-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK6-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK6-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK6-NEXT: store i32* [[ARGC]], i32** [[ARGC_ADDR]], align 4 +// CHECK6-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 4 +// CHECK6-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 +// CHECK6-NEXT: store [10 x i32]* [[C]], 
[10 x i32]** [[C_ADDR]], align 4 +// CHECK6-NEXT: store [10 x i32]* [[D]], [10 x i32]** [[D_ADDR]], align 4 +// CHECK6-NEXT: [[TMP0:%.*]] = load i32*, i32** [[ARGC_ADDR]], align 4 +// CHECK6-NEXT: [[TMP1:%.*]] = load i32*, i32** [[A_ADDR]], align 4 +// CHECK6-NEXT: [[TMP2:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 +// CHECK6-NEXT: [[TMP3:%.*]] = load [10 x i32]*, [10 x i32]** [[C_ADDR]], align 4 +// CHECK6-NEXT: [[TMP4:%.*]] = load [10 x i32]*, [10 x i32]** [[D_ADDR]], align 4 +// CHECK6-NEXT: [[TMP5:%.*]] = load i32, i32* [[TMP0]], align 4 +// CHECK6-NEXT: store i32 [[TMP5]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK6-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK6-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP6]], 0 +// CHECK6-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK6-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK6-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK6-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK6-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK6-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP7]] +// CHECK6-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK6: omp.precond.then: +// CHECK6-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 +// CHECK6-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK6-NEXT: store i32 [[TMP8]], i32* [[DOTOMP_UB]], align 4 +// CHECK6-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK6-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK6-NEXT: store i32 [[TMP9]], i32* [[DOTOMP_LB]], align 4 +// CHECK6-NEXT: store i32 [[TMP10]], i32* [[DOTOMP_UB]], align 4 +// CHECK6-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK6-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK6-NEXT: [[TMP11:%.*]] = bitcast [10 x i32]* [[B3]] to i8* +// CHECK6-NEXT: [[TMP12:%.*]] = bitcast [10 x i32]* [[TMP2]] to i8* +// CHECK6-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 [[TMP11]], i8* align 4 [[TMP12]], i32 40, i1 false) +// CHECK6-NEXT: [[TMP13:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK6-NEXT: [[TMP14:%.*]] = load i32, i32* [[TMP13]], align 4 +// CHECK6-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP14]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) +// CHECK6-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK6-NEXT: store i32 [[TMP15]], i32* [[DOTOMP_IV]], align 4 +// CHECK6-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK6: omp.inner.for.cond: +// CHECK6-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK6-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK6-NEXT: [[CMP6:%.*]] = icmp ule i32 [[TMP16]], [[TMP17]] +// CHECK6-NEXT: br i1 [[CMP6]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK6: omp.inner.for.body: +// CHECK6-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK6-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP18]], 1 +// CHECK6-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK6-NEXT: store i32 [[ADD]], i32* [[I5]], align 4 +// CHECK6-NEXT: [[CALL:%.*]] = call i32 @_Z3fooPi(i32* [[I5]]) #[[ATTR5:[0-9]+]] +// CHECK6-NEXT: [[CALL7:%.*]] = call i32 @_Z3fooPi(i32* [[TMP1]]) #[[ATTR5]] +// CHECK6-NEXT: [[ADD8:%.*]] = add nsw i32 [[CALL]], [[CALL7]] +// CHECK6-NEXT: 
[[TMP19:%.*]] = load i32, i32* [[I5]], align 4 +// CHECK6-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[B3]], i32 0, i32 [[TMP19]] +// CHECK6-NEXT: [[CALL9:%.*]] = call i32 @_Z3fooPi(i32* [[ARRAYIDX]]) #[[ATTR5]] +// CHECK6-NEXT: [[ADD10:%.*]] = add nsw i32 [[ADD8]], [[CALL9]] +// CHECK6-NEXT: [[TMP20:%.*]] = load i32, i32* [[I5]], align 4 +// CHECK6-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[C4]], i32 0, i32 [[TMP20]] +// CHECK6-NEXT: [[CALL12:%.*]] = call i32 @_Z3fooPi(i32* [[ARRAYIDX11]]) #[[ATTR5]] +// CHECK6-NEXT: [[ADD13:%.*]] = add nsw i32 [[ADD10]], [[CALL12]] +// CHECK6-NEXT: [[TMP21:%.*]] = load i32, i32* [[I5]], align 4 +// CHECK6-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP4]], i32 0, i32 [[TMP21]] +// CHECK6-NEXT: [[CALL15:%.*]] = call i32 @_Z3fooPi(i32* [[ARRAYIDX14]]) #[[ATTR5]] +// CHECK6-NEXT: [[ADD16:%.*]] = add nsw i32 [[ADD13]], [[CALL15]] +// CHECK6-NEXT: store i32 [[ADD16]], i32* [[TMP1]], align 4 +// CHECK6-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK6: omp.body.continue: +// CHECK6-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK6: omp.inner.for.inc: +// CHECK6-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK6-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK6-NEXT: [[ADD17:%.*]] = add nsw i32 [[TMP22]], [[TMP23]] +// CHECK6-NEXT: store i32 [[ADD17]], i32* [[DOTOMP_IV]], align 4 +// CHECK6-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK6: omp.inner.for.end: +// CHECK6-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK6: omp.loop.exit: +// CHECK6-NEXT: [[TMP24:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK6-NEXT: [[TMP25:%.*]] = load i32, i32* [[TMP24]], align 4 +// CHECK6-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP25]]) +// CHECK6-NEXT: [[TMP26:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK6-NEXT: [[TMP27:%.*]] = icmp ne i32 [[TMP26]], 0 +// CHECK6-NEXT: br i1 [[TMP27]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]] +// CHECK6: .omp.lastprivate.then: +// CHECK6-NEXT: [[TMP28:%.*]] = bitcast [10 x i32]* [[TMP3]] to i8* +// CHECK6-NEXT: [[TMP29:%.*]] = bitcast [10 x i32]* [[C4]] to i8* +// CHECK6-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 [[TMP28]], i8* align 4 [[TMP29]], i32 40, i1 false) +// CHECK6-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]] +// CHECK6: .omp.lastprivate.done: +// CHECK6-NEXT: br label [[OMP_PRECOND_END]] +// CHECK6: omp.precond.end: +// CHECK6-NEXT: ret void +// +// +// CHECK7-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l31 +// CHECK7-SAME: ([10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[C:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i64 [[ARGC:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[D:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK7-NEXT: entry: +// CHECK7-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 8 +// CHECK7-NEXT: [[C_ADDR:%.*]] = alloca [10 x i32]*, align 8 +// CHECK7-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 8 +// CHECK7-NEXT: [[ARGC_ADDR:%.*]] = alloca i64, align 8 +// CHECK7-NEXT: [[D_ADDR:%.*]] = alloca [10 x i32]*, align 8 +// CHECK7-NEXT: [[ARGC_CASTED:%.*]] = alloca i64, align 8 +// CHECK7-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// 
CHECK7-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 8 +// CHECK7-NEXT: store [10 x i32]* [[C]], [10 x i32]** [[C_ADDR]], align 8 +// CHECK7-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 8 +// CHECK7-NEXT: store i64 [[ARGC]], i64* [[ARGC_ADDR]], align 8 +// CHECK7-NEXT: store [10 x i32]* [[D]], [10 x i32]** [[D_ADDR]], align 8 +// CHECK7-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 8 +// CHECK7-NEXT: [[TMP1:%.*]] = load [10 x i32]*, [10 x i32]** [[C_ADDR]], align 8 +// CHECK7-NEXT: [[TMP2:%.*]] = load i32*, i32** [[A_ADDR]], align 8 +// CHECK7-NEXT: [[CONV:%.*]] = bitcast i64* [[ARGC_ADDR]] to i32* +// CHECK7-NEXT: [[TMP3:%.*]] = load [10 x i32]*, [10 x i32]** [[D_ADDR]], align 8 +// CHECK7-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK7-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) +// CHECK7-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK7: .execute: +// CHECK7-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3:[0-9]+]]) +// CHECK7-NEXT: [[TMP5:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK7-NEXT: [[CONV1:%.*]] = bitcast i64* [[ARGC_CASTED]] to i32* +// CHECK7-NEXT: store i32 [[TMP5]], i32* [[CONV1]], align 4 +// CHECK7-NEXT: [[TMP6:%.*]] = load i64, i64* [[ARGC_CASTED]], align 8 +// CHECK7-NEXT: store i32 [[TMP4]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK7-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x i32]* [[TMP0]], [10 x i32]* [[TMP1]], i32* [[TMP2]], i64 [[TMP6]], [10 x i32]* [[TMP3]]) #[[ATTR4:[0-9]+]] +// CHECK7-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK7: .omp.deinit: +// CHECK7-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) +// CHECK7-NEXT: br label [[DOTEXIT:%.*]] +// CHECK7: .exit: +// CHECK7-NEXT: ret void +// +// +// CHECK7-LABEL: define {{[^@]+}}@__omp_outlined__ +// CHECK7-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[C:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i64 [[ARGC:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[D:%.*]]) #[[ATTR0]] { +// CHECK7-NEXT: entry: +// CHECK7-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK7-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK7-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 8 +// CHECK7-NEXT: [[C_ADDR:%.*]] = alloca [10 x i32]*, align 8 +// CHECK7-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 8 +// CHECK7-NEXT: [[ARGC_ADDR:%.*]] = alloca i64, align 8 +// CHECK7-NEXT: [[D_ADDR:%.*]] = alloca [10 x i32]*, align 8 +// CHECK7-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: [[B4:%.*]] = alloca [10 x i32], align 4 +// CHECK7-NEXT: [[I5:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [7 x i8*], align 8 +// CHECK7-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK7-NEXT: store i32* 
[[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +// CHECK7-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 8 +// CHECK7-NEXT: store [10 x i32]* [[C]], [10 x i32]** [[C_ADDR]], align 8 +// CHECK7-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 8 +// CHECK7-NEXT: store i64 [[ARGC]], i64* [[ARGC_ADDR]], align 8 +// CHECK7-NEXT: store [10 x i32]* [[D]], [10 x i32]** [[D_ADDR]], align 8 +// CHECK7-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 8 +// CHECK7-NEXT: [[TMP1:%.*]] = load [10 x i32]*, [10 x i32]** [[C_ADDR]], align 8 +// CHECK7-NEXT: [[TMP2:%.*]] = load i32*, i32** [[A_ADDR]], align 8 +// CHECK7-NEXT: [[CONV:%.*]] = bitcast i64* [[ARGC_ADDR]] to i32* +// CHECK7-NEXT: [[TMP3:%.*]] = load [10 x i32]*, [10 x i32]** [[D_ADDR]], align 8 +// CHECK7-NEXT: [[TMP4:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared", align 2 +// CHECK7-NEXT: [[TMP5:%.*]] = load i64, i64* @"_openmp_static_kernel$size", align 8 +// CHECK7-NEXT: call void @__kmpc_get_team_static_memory(i16 1, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds (%"union._shared_openmp_static_memory_type_$_", %"union._shared_openmp_static_memory_type_$_" addrspace(3)* @"_openmp_shared_static_glob_rd_$_", i32 0, i32 0, i32 0) to i8*), i64 [[TMP5]], i16 [[TMP4]], i8** addrspacecast (i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr" to i8**)) +// CHECK7-NEXT: [[TMP6:%.*]] = load i8*, i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr", align 8 +// CHECK7-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, i8* [[TMP6]], i64 0 +// CHECK7-NEXT: [[TMP8:%.*]] = bitcast i8* [[TMP7]] to %struct._globalized_locals_ty* +// CHECK7-NEXT: [[C1:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP8]], i32 0, i32 0 +// CHECK7-NEXT: [[TMP9:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK7-NEXT: store i32 [[TMP9]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK7-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK7-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP10]], 0 +// CHECK7-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK7-NEXT: [[SUB3:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK7-NEXT: store i32 [[SUB3]], i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK7-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK7-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK7-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP11]] +// CHECK7-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK7: omp.precond.then: +// CHECK7-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK7-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK7-NEXT: store i32 [[TMP12]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK7-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK7-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK7-NEXT: [[TMP13:%.*]] = bitcast [10 x i32]* [[B4]] to i8* +// CHECK7-NEXT: [[TMP14:%.*]] = bitcast [10 x i32]* [[TMP0]] to i8* +// CHECK7-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[TMP13]], i8* align 4 [[TMP14]], i64 40, i1 false) +// CHECK7-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK7-NEXT: [[TMP15:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK7-NEXT: [[TMP16:%.*]] = load i32, i32* [[TMP15]], align 4 +// CHECK7-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1:[0-9]+]], i32 [[TMP16]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* 
[[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) +// CHECK7-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK7-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK7-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[TMP17]], [[TMP18]] +// CHECK7-NEXT: br i1 [[CMP6]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK7: cond.true: +// CHECK7-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK7-NEXT: br label [[COND_END:%.*]] +// CHECK7: cond.false: +// CHECK7-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK7-NEXT: br label [[COND_END]] +// CHECK7: cond.end: +// CHECK7-NEXT: [[COND:%.*]] = phi i32 [ [[TMP19]], [[COND_TRUE]] ], [ [[TMP20]], [[COND_FALSE]] ] +// CHECK7-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK7-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK7-NEXT: store i32 [[TMP21]], i32* [[DOTOMP_IV]], align 4 +// CHECK7-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK7: omp.inner.for.cond: +// CHECK7-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK7-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK7-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP23]], 1 +// CHECK7-NEXT: [[CMP7:%.*]] = icmp slt i32 [[TMP22]], [[ADD]] +// CHECK7-NEXT: br i1 [[CMP7]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK7: omp.inner.for.body: +// CHECK7-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK7-NEXT: [[TMP25:%.*]] = zext i32 [[TMP24]] to i64 +// CHECK7-NEXT: [[TMP26:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK7-NEXT: [[TMP27:%.*]] = zext i32 [[TMP26]] to i64 +// CHECK7-NEXT: [[TMP28:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0 +// CHECK7-NEXT: [[TMP29:%.*]] = inttoptr i64 [[TMP25]] to i8* +// CHECK7-NEXT: store i8* [[TMP29]], i8** [[TMP28]], align 8 +// CHECK7-NEXT: [[TMP30:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1 +// CHECK7-NEXT: [[TMP31:%.*]] = inttoptr i64 [[TMP27]] to i8* +// CHECK7-NEXT: store i8* [[TMP31]], i8** [[TMP30]], align 8 +// CHECK7-NEXT: [[TMP32:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 2 +// CHECK7-NEXT: [[TMP33:%.*]] = bitcast i32* [[CONV]] to i8* +// CHECK7-NEXT: store i8* [[TMP33]], i8** [[TMP32]], align 8 +// CHECK7-NEXT: [[TMP34:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 3 +// CHECK7-NEXT: [[TMP35:%.*]] = bitcast i32* [[TMP2]] to i8* +// CHECK7-NEXT: store i8* [[TMP35]], i8** [[TMP34]], align 8 +// CHECK7-NEXT: [[TMP36:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 4 +// CHECK7-NEXT: [[TMP37:%.*]] = bitcast [10 x i32]* [[B4]] to i8* +// CHECK7-NEXT: store i8* [[TMP37]], i8** [[TMP36]], align 8 +// CHECK7-NEXT: [[TMP38:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 5 +// CHECK7-NEXT: [[TMP39:%.*]] = bitcast [10 x i32]* [[C1]] to i8* +// CHECK7-NEXT: store i8* [[TMP39]], i8** [[TMP38]], align 8 +// CHECK7-NEXT: [[TMP40:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 6 +// CHECK7-NEXT: [[TMP41:%.*]] = bitcast [10 x i32]* [[TMP3]] to i8* +// CHECK7-NEXT: store i8* [[TMP41]], i8** [[TMP40]], align 8 +// CHECK7-NEXT: [[TMP42:%.*]] = load i32, i32* [[TMP2]], align 4 +// CHECK7-NEXT: [[TOBOOL:%.*]] = 
icmp ne i32 [[TMP42]], 0 +// CHECK7-NEXT: [[TMP43:%.*]] = zext i1 [[TOBOOL]] to i32 +// CHECK7-NEXT: [[TMP44:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK7-NEXT: [[TMP45:%.*]] = load i32, i32* [[TMP44]], align 4 +// CHECK7-NEXT: [[TMP46:%.*]] = bitcast [7 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK7-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP45]], i32 [[TMP43]], i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i64, i64, i32*, i32*, [10 x i32]*, [10 x i32]*, [10 x i32]*)* @__omp_outlined__1 to i8*), i8* null, i8** [[TMP46]], i64 7) +// CHECK7-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK7: omp.inner.for.inc: +// CHECK7-NEXT: [[TMP47:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK7-NEXT: [[TMP48:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK7-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP47]], [[TMP48]] +// CHECK7-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_IV]], align 4 +// CHECK7-NEXT: [[TMP49:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK7-NEXT: [[TMP50:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK7-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP49]], [[TMP50]] +// CHECK7-NEXT: store i32 [[ADD9]], i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK7-NEXT: [[TMP51:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK7-NEXT: [[TMP52:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK7-NEXT: [[ADD10:%.*]] = add nsw i32 [[TMP51]], [[TMP52]] +// CHECK7-NEXT: store i32 [[ADD10]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK7-NEXT: [[TMP53:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK7-NEXT: [[TMP54:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK7-NEXT: [[CMP11:%.*]] = icmp sgt i32 [[TMP53]], [[TMP54]] +// CHECK7-NEXT: br i1 [[CMP11]], label [[COND_TRUE12:%.*]], label [[COND_FALSE13:%.*]] +// CHECK7: cond.true12: +// CHECK7-NEXT: [[TMP55:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK7-NEXT: br label [[COND_END14:%.*]] +// CHECK7: cond.false13: +// CHECK7-NEXT: [[TMP56:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK7-NEXT: br label [[COND_END14]] +// CHECK7: cond.end14: +// CHECK7-NEXT: [[COND15:%.*]] = phi i32 [ [[TMP55]], [[COND_TRUE12]] ], [ [[TMP56]], [[COND_FALSE13]] ] +// CHECK7-NEXT: store i32 [[COND15]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK7-NEXT: [[TMP57:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK7-NEXT: store i32 [[TMP57]], i32* [[DOTOMP_IV]], align 4 +// CHECK7-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK7: omp.inner.for.end: +// CHECK7-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK7: omp.loop.exit: +// CHECK7-NEXT: [[TMP58:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK7-NEXT: [[TMP59:%.*]] = load i32, i32* [[TMP58]], align 4 +// CHECK7-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP59]]) +// CHECK7-NEXT: [[TMP60:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK7-NEXT: [[TMP61:%.*]] = icmp ne i32 [[TMP60]], 0 +// CHECK7-NEXT: br i1 [[TMP61]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]] +// CHECK7: .omp.lastprivate.then: +// CHECK7-NEXT: [[TMP62:%.*]] = bitcast [10 x i32]* [[TMP1]] to i8* +// CHECK7-NEXT: [[TMP63:%.*]] = bitcast [10 x i32]* [[C1]] to i8* +// CHECK7-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[TMP62]], i8* align 4 [[TMP63]], i64 40, i1 false) +// CHECK7-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]] +// CHECK7: .omp.lastprivate.done: +// CHECK7-NEXT: br label [[OMP_PRECOND_END]] +// CHECK7: 
omp.precond.end: +// CHECK7-NEXT: [[TMP64:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared", align 2 +// CHECK7-NEXT: call void @__kmpc_restore_team_static_memory(i16 1, i16 [[TMP64]]) +// CHECK7-NEXT: ret void +// +// +// CHECK7-LABEL: define {{[^@]+}}@__omp_outlined__1 +// CHECK7-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], i32* nonnull align 4 dereferenceable(4) [[ARGC:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[C:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[D:%.*]]) #[[ATTR0]] { +// CHECK7-NEXT: entry: +// CHECK7-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK7-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK7-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8 +// CHECK7-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8 +// CHECK7-NEXT: [[ARGC_ADDR:%.*]] = alloca i32*, align 8 +// CHECK7-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 8 +// CHECK7-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 8 +// CHECK7-NEXT: [[C_ADDR:%.*]] = alloca [10 x i32]*, align 8 +// CHECK7-NEXT: [[D_ADDR:%.*]] = alloca [10 x i32]*, align 8 +// CHECK7-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: [[B4:%.*]] = alloca [10 x i32], align 4 +// CHECK7-NEXT: [[C5:%.*]] = alloca [10 x i32], align 4 +// CHECK7-NEXT: [[I6:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK7-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +// CHECK7-NEXT: store i64 [[DOTPREVIOUS_LB_]], i64* [[DOTPREVIOUS_LB__ADDR]], align 8 +// CHECK7-NEXT: store i64 [[DOTPREVIOUS_UB_]], i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK7-NEXT: store i32* [[ARGC]], i32** [[ARGC_ADDR]], align 8 +// CHECK7-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 8 +// CHECK7-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 8 +// CHECK7-NEXT: store [10 x i32]* [[C]], [10 x i32]** [[C_ADDR]], align 8 +// CHECK7-NEXT: store [10 x i32]* [[D]], [10 x i32]** [[D_ADDR]], align 8 +// CHECK7-NEXT: [[TMP0:%.*]] = load i32*, i32** [[ARGC_ADDR]], align 8 +// CHECK7-NEXT: [[TMP1:%.*]] = load i32*, i32** [[A_ADDR]], align 8 +// CHECK7-NEXT: [[TMP2:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 8 +// CHECK7-NEXT: [[TMP3:%.*]] = load [10 x i32]*, [10 x i32]** [[C_ADDR]], align 8 +// CHECK7-NEXT: [[TMP4:%.*]] = load [10 x i32]*, [10 x i32]** [[D_ADDR]], align 8 +// CHECK7-NEXT: [[TMP5:%.*]] = load i32, i32* [[TMP0]], align 4 +// CHECK7-NEXT: store i32 [[TMP5]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK7-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK7-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP6]], 0 +// CHECK7-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK7-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK7-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 +// 
CHECK7-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK7-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK7-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP7]] +// CHECK7-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK7: omp.precond.then: +// CHECK7-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 +// CHECK7-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK7-NEXT: store i32 [[TMP8]], i32* [[DOTOMP_UB]], align 4 +// CHECK7-NEXT: [[TMP9:%.*]] = load i64, i64* [[DOTPREVIOUS_LB__ADDR]], align 8 +// CHECK7-NEXT: [[CONV:%.*]] = trunc i64 [[TMP9]] to i32 +// CHECK7-NEXT: [[TMP10:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK7-NEXT: [[CONV3:%.*]] = trunc i64 [[TMP10]] to i32 +// CHECK7-NEXT: store i32 [[CONV]], i32* [[DOTOMP_LB]], align 4 +// CHECK7-NEXT: store i32 [[CONV3]], i32* [[DOTOMP_UB]], align 4 +// CHECK7-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK7-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK7-NEXT: [[TMP11:%.*]] = bitcast [10 x i32]* [[B4]] to i8* +// CHECK7-NEXT: [[TMP12:%.*]] = bitcast [10 x i32]* [[TMP2]] to i8* +// CHECK7-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[TMP11]], i8* align 4 [[TMP12]], i64 40, i1 false) +// CHECK7-NEXT: [[TMP13:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK7-NEXT: [[TMP14:%.*]] = load i32, i32* [[TMP13]], align 4 +// CHECK7-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP14]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) +// CHECK7-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK7-NEXT: store i32 [[TMP15]], i32* [[DOTOMP_IV]], align 4 +// CHECK7-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK7: omp.inner.for.cond: +// CHECK7-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK7-NEXT: [[CONV7:%.*]] = sext i32 [[TMP16]] to i64 +// CHECK7-NEXT: [[TMP17:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK7-NEXT: [[CMP8:%.*]] = icmp ule i64 [[CONV7]], [[TMP17]] +// CHECK7-NEXT: br i1 [[CMP8]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK7: omp.inner.for.body: +// CHECK7-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK7-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP18]], 1 +// CHECK7-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK7-NEXT: store i32 [[ADD]], i32* [[I6]], align 4 +// CHECK7-NEXT: [[CALL:%.*]] = call i32 @_Z3fooPi(i32* [[I6]]) #[[ATTR5:[0-9]+]] +// CHECK7-NEXT: [[CALL9:%.*]] = call i32 @_Z3fooPi(i32* [[TMP1]]) #[[ATTR5]] +// CHECK7-NEXT: [[ADD10:%.*]] = add nsw i32 [[CALL]], [[CALL9]] +// CHECK7-NEXT: [[TMP19:%.*]] = load i32, i32* [[I6]], align 4 +// CHECK7-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP19]] to i64 +// CHECK7-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[B4]], i64 0, i64 [[IDXPROM]] +// CHECK7-NEXT: [[CALL11:%.*]] = call i32 @_Z3fooPi(i32* [[ARRAYIDX]]) #[[ATTR5]] +// CHECK7-NEXT: [[ADD12:%.*]] = add nsw i32 [[ADD10]], [[CALL11]] +// CHECK7-NEXT: [[TMP20:%.*]] = load i32, i32* [[I6]], align 4 +// CHECK7-NEXT: [[IDXPROM13:%.*]] = sext i32 [[TMP20]] to i64 +// CHECK7-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[C5]], i64 0, i64 [[IDXPROM13]] +// CHECK7-NEXT: [[CALL15:%.*]] = call i32 @_Z3fooPi(i32* [[ARRAYIDX14]]) #[[ATTR5]] +// CHECK7-NEXT: [[ADD16:%.*]] = add nsw i32 [[ADD12]], [[CALL15]] +// 
CHECK7-NEXT: [[TMP21:%.*]] = load i32, i32* [[I6]], align 4 +// CHECK7-NEXT: [[IDXPROM17:%.*]] = sext i32 [[TMP21]] to i64 +// CHECK7-NEXT: [[ARRAYIDX18:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP4]], i64 0, i64 [[IDXPROM17]] +// CHECK7-NEXT: [[CALL19:%.*]] = call i32 @_Z3fooPi(i32* [[ARRAYIDX18]]) #[[ATTR5]] +// CHECK7-NEXT: [[ADD20:%.*]] = add nsw i32 [[ADD16]], [[CALL19]] +// CHECK7-NEXT: store i32 [[ADD20]], i32* [[TMP1]], align 4 +// CHECK7-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK7: omp.body.continue: +// CHECK7-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK7: omp.inner.for.inc: +// CHECK7-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK7-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK7-NEXT: [[ADD21:%.*]] = add nsw i32 [[TMP22]], [[TMP23]] +// CHECK7-NEXT: store i32 [[ADD21]], i32* [[DOTOMP_IV]], align 4 +// CHECK7-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK7: omp.inner.for.end: +// CHECK7-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK7: omp.loop.exit: +// CHECK7-NEXT: [[TMP24:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK7-NEXT: [[TMP25:%.*]] = load i32, i32* [[TMP24]], align 4 +// CHECK7-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP25]]) +// CHECK7-NEXT: [[TMP26:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK7-NEXT: [[TMP27:%.*]] = icmp ne i32 [[TMP26]], 0 +// CHECK7-NEXT: br i1 [[TMP27]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]] +// CHECK7: .omp.lastprivate.then: +// CHECK7-NEXT: [[TMP28:%.*]] = bitcast [10 x i32]* [[TMP3]] to i8* +// CHECK7-NEXT: [[TMP29:%.*]] = bitcast [10 x i32]* [[C5]] to i8* +// CHECK7-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[TMP28]], i8* align 4 [[TMP29]], i64 40, i1 false) +// CHECK7-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]] +// CHECK7: .omp.lastprivate.done: +// CHECK7-NEXT: br label [[OMP_PRECOND_END]] +// CHECK7: omp.precond.end: +// CHECK7-NEXT: ret void +// +// +// CHECK8-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l31 +// CHECK8-SAME: ([10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[C:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i64 [[ARGC:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[D:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK8-NEXT: entry: +// CHECK8-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 8 +// CHECK8-NEXT: [[C_ADDR:%.*]] = alloca [10 x i32]*, align 8 +// CHECK8-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 8 +// CHECK8-NEXT: [[ARGC_ADDR:%.*]] = alloca i64, align 8 +// CHECK8-NEXT: [[D_ADDR:%.*]] = alloca [10 x i32]*, align 8 +// CHECK8-NEXT: [[ARGC_CASTED:%.*]] = alloca i64, align 8 +// CHECK8-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK8-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 8 +// CHECK8-NEXT: store [10 x i32]* [[C]], [10 x i32]** [[C_ADDR]], align 8 +// CHECK8-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 8 +// CHECK8-NEXT: store i64 [[ARGC]], i64* [[ARGC_ADDR]], align 8 +// CHECK8-NEXT: store [10 x i32]* [[D]], [10 x i32]** [[D_ADDR]], align 8 +// CHECK8-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 8 +// CHECK8-NEXT: [[TMP1:%.*]] = load [10 x i32]*, [10 x i32]** [[C_ADDR]], align 8 +// CHECK8-NEXT: [[TMP2:%.*]] = load i32*, i32** [[A_ADDR]], align 8 
+// CHECK8-NEXT: [[CONV:%.*]] = bitcast i64* [[ARGC_ADDR]] to i32* +// CHECK8-NEXT: [[TMP3:%.*]] = load [10 x i32]*, [10 x i32]** [[D_ADDR]], align 8 +// CHECK8-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK8-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) +// CHECK8-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK8: .execute: +// CHECK8-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3:[0-9]+]]) +// CHECK8-NEXT: [[TMP5:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK8-NEXT: [[CONV1:%.*]] = bitcast i64* [[ARGC_CASTED]] to i32* +// CHECK8-NEXT: store i32 [[TMP5]], i32* [[CONV1]], align 4 +// CHECK8-NEXT: [[TMP6:%.*]] = load i64, i64* [[ARGC_CASTED]], align 8 +// CHECK8-NEXT: store i32 [[TMP4]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK8-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x i32]* [[TMP0]], [10 x i32]* [[TMP1]], i32* [[TMP2]], i64 [[TMP6]], [10 x i32]* [[TMP3]]) #[[ATTR4:[0-9]+]] +// CHECK8-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK8: .omp.deinit: +// CHECK8-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) +// CHECK8-NEXT: br label [[DOTEXIT:%.*]] +// CHECK8: .exit: +// CHECK8-NEXT: ret void +// +// +// CHECK8-LABEL: define {{[^@]+}}@__omp_outlined__ +// CHECK8-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[C:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i64 [[ARGC:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[D:%.*]]) #[[ATTR0]] { +// CHECK8-NEXT: entry: +// CHECK8-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK8-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK8-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 8 +// CHECK8-NEXT: [[C_ADDR:%.*]] = alloca [10 x i32]*, align 8 +// CHECK8-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 8 +// CHECK8-NEXT: [[ARGC_ADDR:%.*]] = alloca i64, align 8 +// CHECK8-NEXT: [[D_ADDR:%.*]] = alloca [10 x i32]*, align 8 +// CHECK8-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: [[B4:%.*]] = alloca [10 x i32], align 4 +// CHECK8-NEXT: [[I5:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [7 x i8*], align 8 +// CHECK8-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK8-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +// CHECK8-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 8 +// CHECK8-NEXT: store [10 x i32]* [[C]], [10 x i32]** [[C_ADDR]], align 8 +// CHECK8-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 8 +// CHECK8-NEXT: store i64 [[ARGC]], i64* [[ARGC_ADDR]], align 8 +// CHECK8-NEXT: store [10 x i32]* [[D]], [10 x i32]** [[D_ADDR]], align 8 +// CHECK8-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 8 +// CHECK8-NEXT: [[TMP1:%.*]] = load [10 x i32]*, [10 x i32]** [[C_ADDR]], align 8 +// CHECK8-NEXT: 
[[TMP2:%.*]] = load i32*, i32** [[A_ADDR]], align 8 +// CHECK8-NEXT: [[CONV:%.*]] = bitcast i64* [[ARGC_ADDR]] to i32* +// CHECK8-NEXT: [[TMP3:%.*]] = load [10 x i32]*, [10 x i32]** [[D_ADDR]], align 8 +// CHECK8-NEXT: [[TMP4:%.*]] = call i8* @__kmpc_data_sharing_push_stack(i64 40, i16 1) +// CHECK8-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to %struct._globalized_locals_ty* +// CHECK8-NEXT: [[C1:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP5]], i32 0, i32 0 +// CHECK8-NEXT: [[TMP6:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK8-NEXT: store i32 [[TMP6]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK8-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK8-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP7]], 0 +// CHECK8-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK8-NEXT: [[SUB3:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK8-NEXT: store i32 [[SUB3]], i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK8-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK8-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK8-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP8]] +// CHECK8-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK8: omp.precond.then: +// CHECK8-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK8-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK8-NEXT: store i32 [[TMP9]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK8-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK8-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK8-NEXT: [[TMP10:%.*]] = bitcast [10 x i32]* [[B4]] to i8* +// CHECK8-NEXT: [[TMP11:%.*]] = bitcast [10 x i32]* [[TMP0]] to i8* +// CHECK8-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[TMP10]], i8* align 4 [[TMP11]], i64 40, i1 false) +// CHECK8-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK8-NEXT: [[TMP12:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK8-NEXT: [[TMP13:%.*]] = load i32, i32* [[TMP12]], align 4 +// CHECK8-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1:[0-9]+]], i32 [[TMP13]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) +// CHECK8-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK8-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK8-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[TMP14]], [[TMP15]] +// CHECK8-NEXT: br i1 [[CMP6]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK8: cond.true: +// CHECK8-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK8-NEXT: br label [[COND_END:%.*]] +// CHECK8: cond.false: +// CHECK8-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK8-NEXT: br label [[COND_END]] +// CHECK8: cond.end: +// CHECK8-NEXT: [[COND:%.*]] = phi i32 [ [[TMP16]], [[COND_TRUE]] ], [ [[TMP17]], [[COND_FALSE]] ] +// CHECK8-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK8-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK8-NEXT: store i32 [[TMP18]], i32* [[DOTOMP_IV]], align 4 +// CHECK8-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK8: omp.inner.for.cond: +// CHECK8-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK8-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK8-NEXT: [[ADD:%.*]] = add nsw i32 
[[TMP20]], 1
+// CHECK8-NEXT: [[CMP7:%.*]] = icmp slt i32 [[TMP19]], [[ADD]]
+// CHECK8-NEXT: br i1 [[CMP7]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK8: omp.inner.for.body:
+// CHECK8-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK8-NEXT: [[TMP22:%.*]] = zext i32 [[TMP21]] to i64
+// CHECK8-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK8-NEXT: [[TMP24:%.*]] = zext i32 [[TMP23]] to i64
+// CHECK8-NEXT: [[TMP25:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0
+// CHECK8-NEXT: [[TMP26:%.*]] = inttoptr i64 [[TMP22]] to i8*
+// CHECK8-NEXT: store i8* [[TMP26]], i8** [[TMP25]], align 8
+// CHECK8-NEXT: [[TMP27:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1
+// CHECK8-NEXT: [[TMP28:%.*]] = inttoptr i64 [[TMP24]] to i8*
+// CHECK8-NEXT: store i8* [[TMP28]], i8** [[TMP27]], align 8
+// CHECK8-NEXT: [[TMP29:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 2
+// CHECK8-NEXT: [[TMP30:%.*]] = bitcast i32* [[CONV]] to i8*
+// CHECK8-NEXT: store i8* [[TMP30]], i8** [[TMP29]], align 8
+// CHECK8-NEXT: [[TMP31:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 3
+// CHECK8-NEXT: [[TMP32:%.*]] = bitcast i32* [[TMP2]] to i8*
+// CHECK8-NEXT: store i8* [[TMP32]], i8** [[TMP31]], align 8
+// CHECK8-NEXT: [[TMP33:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 4
+// CHECK8-NEXT: [[TMP34:%.*]] = bitcast [10 x i32]* [[B4]] to i8*
+// CHECK8-NEXT: store i8* [[TMP34]], i8** [[TMP33]], align 8
+// CHECK8-NEXT: [[TMP35:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 5
+// CHECK8-NEXT: [[TMP36:%.*]] = bitcast [10 x i32]* [[C1]] to i8*
+// CHECK8-NEXT: store i8* [[TMP36]], i8** [[TMP35]], align 8
+// CHECK8-NEXT: [[TMP37:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 6
+// CHECK8-NEXT: [[TMP38:%.*]] = bitcast [10 x i32]* [[TMP3]] to i8*
+// CHECK8-NEXT: store i8* [[TMP38]], i8** [[TMP37]], align 8
+// CHECK8-NEXT: [[TMP39:%.*]] = load i32, i32* [[TMP2]], align 4
+// CHECK8-NEXT: [[TOBOOL:%.*]] = icmp ne i32 [[TMP39]], 0
+// CHECK8-NEXT: [[TMP40:%.*]] = zext i1 [[TOBOOL]] to i32
+// CHECK8-NEXT: [[TMP41:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK8-NEXT: [[TMP42:%.*]] = load i32, i32* [[TMP41]], align 4
+// CHECK8-NEXT: [[TMP43:%.*]] = bitcast [7 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK8-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP42]], i32 [[TMP40]], i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i64, i64, i32*, i32*, [10 x i32]*, [10 x i32]*, [10 x i32]*)* @__omp_outlined__1 to i8*), i8* null, i8** [[TMP43]], i64 7)
+// CHECK8-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK8: omp.inner.for.inc:
+// CHECK8-NEXT: [[TMP44:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK8-NEXT: [[TMP45:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK8-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP44]], [[TMP45]]
+// CHECK8-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_IV]], align 4
+// CHECK8-NEXT: [[TMP46:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK8-NEXT: [[TMP47:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK8-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP46]], [[TMP47]]
+// CHECK8-NEXT: store i32 [[ADD9]], i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK8-NEXT: [[TMP48:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK8-NEXT: [[TMP49:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK8-NEXT: [[ADD10:%.*]] = add nsw i32 [[TMP48]], [[TMP49]]
+// CHECK8-NEXT: store i32 [[ADD10]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK8-NEXT: [[TMP50:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK8-NEXT: [[TMP51:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK8-NEXT: [[CMP11:%.*]] = icmp sgt i32 [[TMP50]], [[TMP51]]
+// CHECK8-NEXT: br i1 [[CMP11]], label [[COND_TRUE12:%.*]], label [[COND_FALSE13:%.*]]
+// CHECK8: cond.true12:
+// CHECK8-NEXT: [[TMP52:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK8-NEXT: br label [[COND_END14:%.*]]
+// CHECK8: cond.false13:
+// CHECK8-NEXT: [[TMP53:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK8-NEXT: br label [[COND_END14]]
+// CHECK8: cond.end14:
+// CHECK8-NEXT: [[COND15:%.*]] = phi i32 [ [[TMP52]], [[COND_TRUE12]] ], [ [[TMP53]], [[COND_FALSE13]] ]
+// CHECK8-NEXT: store i32 [[COND15]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK8-NEXT: [[TMP54:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK8-NEXT: store i32 [[TMP54]], i32* [[DOTOMP_IV]], align 4
+// CHECK8-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK8: omp.inner.for.end:
+// CHECK8-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK8: omp.loop.exit:
+// CHECK8-NEXT: [[TMP55:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK8-NEXT: [[TMP56:%.*]] = load i32, i32* [[TMP55]], align 4
+// CHECK8-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP56]])
+// CHECK8-NEXT: [[TMP57:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK8-NEXT: [[TMP58:%.*]] = icmp ne i32 [[TMP57]], 0
+// CHECK8-NEXT: br i1 [[TMP58]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]]
+// CHECK8: .omp.lastprivate.then:
+// CHECK8-NEXT: [[TMP59:%.*]] = bitcast [10 x i32]* [[TMP1]] to i8*
+// CHECK8-NEXT: [[TMP60:%.*]] = bitcast [10 x i32]* [[C1]] to i8*
+// CHECK8-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[TMP59]], i8* align 4 [[TMP60]], i64 40, i1 false)
+// CHECK8-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]]
+// CHECK8: .omp.lastprivate.done:
+// CHECK8-NEXT: br label [[OMP_PRECOND_END]]
+// CHECK8: omp.precond.end:
+// CHECK8-NEXT: call void @__kmpc_data_sharing_pop_stack(i8* [[TMP4]])
+// CHECK8-NEXT: ret void
+//
+//
+// CHECK8-LABEL: define {{[^@]+}}@__omp_outlined__1
+// CHECK8-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], i32* nonnull align 4 dereferenceable(4) [[ARGC:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[C:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[D:%.*]]) #[[ATTR0]] {
+// CHECK8-NEXT: entry:
+// CHECK8-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK8-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK8-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8
+// CHECK8-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8
+// CHECK8-NEXT: [[ARGC_ADDR:%.*]] = alloca i32*, align 8
+// CHECK8-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 8
+// CHECK8-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 8
+// CHECK8-NEXT: [[C_ADDR:%.*]] = alloca [10 x i32]*, align 8
+// CHECK8-NEXT: [[D_ADDR:%.*]] = alloca [10 x i32]*, align 8
+// CHECK8-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK8-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK8-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK8-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
+// CHECK8-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK8-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK8-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK8-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK8-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK8-NEXT: [[B4:%.*]] = alloca [10 x i32], align 4
+// CHECK8-NEXT: [[C5:%.*]] = alloca [10 x i32], align 4
+// CHECK8-NEXT: [[I6:%.*]] = alloca i32, align 4
+// CHECK8-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK8-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
+// CHECK8-NEXT: store i64 [[DOTPREVIOUS_LB_]], i64* [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK8-NEXT: store i64 [[DOTPREVIOUS_UB_]], i64* [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK8-NEXT: store i32* [[ARGC]], i32** [[ARGC_ADDR]], align 8
+// CHECK8-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 8
+// CHECK8-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 8
+// CHECK8-NEXT: store [10 x i32]* [[C]], [10 x i32]** [[C_ADDR]], align 8
+// CHECK8-NEXT: store [10 x i32]* [[D]], [10 x i32]** [[D_ADDR]], align 8
+// CHECK8-NEXT: [[TMP0:%.*]] = load i32*, i32** [[ARGC_ADDR]], align 8
+// CHECK8-NEXT: [[TMP1:%.*]] = load i32*, i32** [[A_ADDR]], align 8
+// CHECK8-NEXT: [[TMP2:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 8
+// CHECK8-NEXT: [[TMP3:%.*]] = load [10 x i32]*, [10 x i32]** [[C_ADDR]], align 8
+// CHECK8-NEXT: [[TMP4:%.*]] = load [10 x i32]*, [10 x i32]** [[D_ADDR]], align 8
+// CHECK8-NEXT: [[TMP5:%.*]] = load i32, i32* [[TMP0]], align 4
+// CHECK8-NEXT: store i32 [[TMP5]], i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK8-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK8-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP6]], 0
+// CHECK8-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK8-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1
+// CHECK8-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK8-NEXT: store i32 0, i32* [[I]], align 4
+// CHECK8-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK8-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP7]]
+// CHECK8-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]]
+// CHECK8: omp.precond.then:
+// CHECK8-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK8-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK8-NEXT: store i32 [[TMP8]], i32* [[DOTOMP_UB]], align 4
+// CHECK8-NEXT: [[TMP9:%.*]] = load i64, i64* [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK8-NEXT: [[CONV:%.*]] = trunc i64 [[TMP9]] to i32
+// CHECK8-NEXT: [[TMP10:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK8-NEXT: [[CONV3:%.*]] = trunc i64 [[TMP10]] to i32
+// CHECK8-NEXT: store i32 [[CONV]], i32* [[DOTOMP_LB]], align 4
+// CHECK8-NEXT: store i32 [[CONV3]], i32* [[DOTOMP_UB]], align 4
+// CHECK8-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK8-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK8-NEXT: [[TMP11:%.*]] = bitcast [10 x i32]* [[B4]] to i8*
+// CHECK8-NEXT: [[TMP12:%.*]] = bitcast [10 x i32]* [[TMP2]] to i8*
+// CHECK8-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[TMP11]], i8* align 4 [[TMP12]], i64 40, i1 false)
+// CHECK8-NEXT: [[TMP13:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK8-NEXT: [[TMP14:%.*]] = load i32, i32* [[TMP13]], align 4
+// CHECK8-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP14]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK8-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK8-NEXT: store i32 [[TMP15]], i32* [[DOTOMP_IV]], align 4
+// CHECK8-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK8: omp.inner.for.cond:
+// CHECK8-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK8-NEXT: [[CONV7:%.*]] = sext i32 [[TMP16]] to i64
+// CHECK8-NEXT: [[TMP17:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK8-NEXT: [[CMP8:%.*]] = icmp ule i64 [[CONV7]], [[TMP17]]
+// CHECK8-NEXT: br i1 [[CMP8]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK8: omp.inner.for.body:
+// CHECK8-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK8-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP18]], 1
+// CHECK8-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK8-NEXT: store i32 [[ADD]], i32* [[I6]], align 4
+// CHECK8-NEXT: [[CALL:%.*]] = call i32 @_Z3fooPi(i32* [[I6]]) #[[ATTR5:[0-9]+]]
+// CHECK8-NEXT: [[CALL9:%.*]] = call i32 @_Z3fooPi(i32* [[TMP1]]) #[[ATTR5]]
+// CHECK8-NEXT: [[ADD10:%.*]] = add nsw i32 [[CALL]], [[CALL9]]
+// CHECK8-NEXT: [[TMP19:%.*]] = load i32, i32* [[I6]], align 4
+// CHECK8-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP19]] to i64
+// CHECK8-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[B4]], i64 0, i64 [[IDXPROM]]
+// CHECK8-NEXT: [[CALL11:%.*]] = call i32 @_Z3fooPi(i32* [[ARRAYIDX]]) #[[ATTR5]]
+// CHECK8-NEXT: [[ADD12:%.*]] = add nsw i32 [[ADD10]], [[CALL11]]
+// CHECK8-NEXT: [[TMP20:%.*]] = load i32, i32* [[I6]], align 4
+// CHECK8-NEXT: [[IDXPROM13:%.*]] = sext i32 [[TMP20]] to i64
+// CHECK8-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[C5]], i64 0, i64 [[IDXPROM13]]
+// CHECK8-NEXT: [[CALL15:%.*]] = call i32 @_Z3fooPi(i32* [[ARRAYIDX14]]) #[[ATTR5]]
+// CHECK8-NEXT: [[ADD16:%.*]] = add nsw i32 [[ADD12]], [[CALL15]]
+// CHECK8-NEXT: [[TMP21:%.*]] = load i32, i32* [[I6]], align 4
+// CHECK8-NEXT: [[IDXPROM17:%.*]] = sext i32 [[TMP21]] to i64
+// CHECK8-NEXT: [[ARRAYIDX18:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP4]], i64 0, i64 [[IDXPROM17]]
+// CHECK8-NEXT: [[CALL19:%.*]] = call i32 @_Z3fooPi(i32* [[ARRAYIDX18]]) #[[ATTR5]]
+// CHECK8-NEXT: [[ADD20:%.*]] = add nsw i32 [[ADD16]], [[CALL19]]
+// CHECK8-NEXT: store i32 [[ADD20]], i32* [[TMP1]], align 4
+// CHECK8-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK8: omp.body.continue:
+// CHECK8-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK8: omp.inner.for.inc:
+// CHECK8-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK8-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK8-NEXT: [[ADD21:%.*]] = add nsw i32 [[TMP22]], [[TMP23]]
+// CHECK8-NEXT: store i32 [[ADD21]], i32* [[DOTOMP_IV]], align 4
+// CHECK8-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK8: omp.inner.for.end:
+// CHECK8-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK8: omp.loop.exit:
+// CHECK8-NEXT: [[TMP24:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK8-NEXT: [[TMP25:%.*]] = load i32, i32* [[TMP24]], align 4
+// CHECK8-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP25]])
+// CHECK8-NEXT: [[TMP26:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK8-NEXT: [[TMP27:%.*]] = icmp ne i32 [[TMP26]], 0
+// CHECK8-NEXT: br i1 [[TMP27]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]]
+// CHECK8: .omp.lastprivate.then:
+// CHECK8-NEXT: [[TMP28:%.*]] = bitcast [10 x i32]* [[TMP3]] to i8*
+// CHECK8-NEXT: [[TMP29:%.*]] = bitcast [10 x i32]* [[C5]] to i8*
+// CHECK8-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[TMP28]], i8* align 4 [[TMP29]], i64 40, i1 false)
+// CHECK8-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]]
+// CHECK8: .omp.lastprivate.done:
+// CHECK8-NEXT: br label [[OMP_PRECOND_END]]
+// CHECK8: omp.precond.end:
+// CHECK8-NEXT: ret void
+//
+//
+// CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l31
+// CHECK9-SAME: ([10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[C:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i32 [[ARGC:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[D:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK9-NEXT: entry:
+// CHECK9-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4
+// CHECK9-NEXT: [[C_ADDR:%.*]] = alloca [10 x i32]*, align 4
+// CHECK9-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 4
+// CHECK9-NEXT: [[ARGC_ADDR:%.*]] = alloca i32, align 4
+// CHECK9-NEXT: [[D_ADDR:%.*]] = alloca [10 x i32]*, align 4
+// CHECK9-NEXT: [[ARGC_CASTED:%.*]] = alloca i32, align 4
+// CHECK9-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK9-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK9-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK9-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4
+// CHECK9-NEXT: store [10 x i32]* [[C]], [10 x i32]** [[C_ADDR]], align 4
+// CHECK9-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 4
+// CHECK9-NEXT: store i32 [[ARGC]], i32* [[ARGC_ADDR]], align 4
+// CHECK9-NEXT: store [10 x i32]* [[D]], [10 x i32]** [[D_ADDR]], align 4
+// CHECK9-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4
+// CHECK9-NEXT: [[TMP1:%.*]] = load [10 x i32]*, [10 x i32]** [[C_ADDR]], align 4
+// CHECK9-NEXT: [[TMP2:%.*]] = load i32*, i32** [[A_ADDR]], align 4
+// CHECK9-NEXT: [[TMP3:%.*]] = load [10 x i32]*, [10 x i32]** [[D_ADDR]], align 4
+// CHECK9-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK9-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0)
+// CHECK9-NEXT: br label [[DOTEXECUTE:%.*]]
+// CHECK9: .execute:
+// CHECK9-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3:[0-9]+]])
+// CHECK9-NEXT: [[TMP5:%.*]] = load i32, i32* [[ARGC_ADDR]], align 4
+// CHECK9-NEXT: store i32 [[TMP5]], i32* [[ARGC_CASTED]], align 4
+// CHECK9-NEXT: [[TMP6:%.*]] = load i32, i32* [[ARGC_CASTED]], align 4
+// CHECK9-NEXT: store i32 [[TMP4]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK9-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x i32]* [[TMP0]], [10 x i32]* [[TMP1]], i32* [[TMP2]], i32 [[TMP6]], [10 x i32]* [[TMP3]]) #[[ATTR4:[0-9]+]]
+// CHECK9-NEXT: br label [[DOTOMP_DEINIT:%.*]]
+// CHECK9: .omp.deinit:
+// CHECK9-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0)
+// CHECK9-NEXT: br label [[DOTEXIT:%.*]]
+// CHECK9: .exit:
+// CHECK9-NEXT: ret void
+//
+//
+// CHECK9-LABEL: define {{[^@]+}}@__omp_outlined__
+// CHECK9-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[C:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i32 [[ARGC:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[D:%.*]]) #[[ATTR0]] {
+// CHECK9-NEXT: entry:
+// CHECK9-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK9-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK9-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4
+// CHECK9-NEXT: [[C_ADDR:%.*]] = alloca [10 x i32]*, align 4
+// CHECK9-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 4
+// CHECK9-NEXT: [[ARGC_ADDR:%.*]] = alloca i32, align 4
+// CHECK9-NEXT: [[D_ADDR:%.*]] = alloca [10 x i32]*, align 4
+// CHECK9-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK9-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK9-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK9-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4
+// CHECK9-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK9-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK9-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK9-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK9-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK9-NEXT: [[B4:%.*]] = alloca [10 x i32], align 4
+// CHECK9-NEXT: [[I5:%.*]] = alloca i32, align 4
+// CHECK9-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [7 x i8*], align 4
+// CHECK9-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK9-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK9-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4
+// CHECK9-NEXT: store [10 x i32]* [[C]], [10 x i32]** [[C_ADDR]], align 4
+// CHECK9-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 4
+// CHECK9-NEXT: store i32 [[ARGC]], i32* [[ARGC_ADDR]], align 4
+// CHECK9-NEXT: store [10 x i32]* [[D]], [10 x i32]** [[D_ADDR]], align 4
+// CHECK9-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4
+// CHECK9-NEXT: [[TMP1:%.*]] = load [10 x i32]*, [10 x i32]** [[C_ADDR]], align 4
+// CHECK9-NEXT: [[TMP2:%.*]] = load i32*, i32** [[A_ADDR]], align 4
+// CHECK9-NEXT: [[TMP3:%.*]] = load [10 x i32]*, [10 x i32]** [[D_ADDR]], align 4
+// CHECK9-NEXT: [[TMP4:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared", align 2
+// CHECK9-NEXT: [[TMP5:%.*]] = load i32, i32* @"_openmp_static_kernel$size", align 4
+// CHECK9-NEXT: call void @__kmpc_get_team_static_memory(i16 1, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds (%"union._shared_openmp_static_memory_type_$_", %"union._shared_openmp_static_memory_type_$_" addrspace(3)* @"_openmp_shared_static_glob_rd_$_", i32 0, i32 0, i32 0) to i8*), i32 [[TMP5]], i16 [[TMP4]], i8** addrspacecast (i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr" to i8**))
+// CHECK9-NEXT: [[TMP6:%.*]] = load i8*, i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr", align 4
+// CHECK9-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, i8* [[TMP6]], i32 0
+// CHECK9-NEXT: [[TMP8:%.*]] = bitcast i8* [[TMP7]] to %struct._globalized_locals_ty*
+// CHECK9-NEXT: [[C1:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP8]], i32 0, i32 0
+// CHECK9-NEXT: [[TMP9:%.*]] = load i32, i32* [[ARGC_ADDR]], align 4
+// CHECK9-NEXT: store i32 [[TMP9]], i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK9-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK9-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP10]], 0
+// CHECK9-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK9-NEXT: [[SUB3:%.*]] = sub nsw i32 [[DIV]], 1
+// CHECK9-NEXT: store i32 [[SUB3]], i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK9-NEXT: store i32 0, i32* [[I]], align 4
+// CHECK9-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK9-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP11]]
+// CHECK9-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]]
+// CHECK9: omp.precond.then:
+// CHECK9-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK9-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK9-NEXT: store i32 [[TMP12]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK9-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK9-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK9-NEXT: [[TMP13:%.*]] = bitcast [10 x i32]* [[B4]] to i8*
+// CHECK9-NEXT: [[TMP14:%.*]] = bitcast [10 x i32]* [[TMP0]] to i8*
+// CHECK9-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 [[TMP13]], i8* align 4 [[TMP14]], i32 40, i1 false)
+// CHECK9-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK9-NEXT: [[TMP15:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK9-NEXT: [[TMP16:%.*]] = load i32, i32* [[TMP15]], align 4
+// CHECK9-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1:[0-9]+]], i32 [[TMP16]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]])
+// CHECK9-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK9-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK9-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[TMP17]], [[TMP18]]
+// CHECK9-NEXT: br i1 [[CMP6]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK9: cond.true:
+// CHECK9-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK9-NEXT: br label [[COND_END:%.*]]
+// CHECK9: cond.false:
+// CHECK9-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK9-NEXT: br label [[COND_END]]
+// CHECK9: cond.end:
+// CHECK9-NEXT: [[COND:%.*]] = phi i32 [ [[TMP19]], [[COND_TRUE]] ], [ [[TMP20]], [[COND_FALSE]] ]
+// CHECK9-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK9-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK9-NEXT: store i32 [[TMP21]], i32* [[DOTOMP_IV]], align 4
+// CHECK9-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK9: omp.inner.for.cond:
+// CHECK9-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK9-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK9-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP23]], 1
+// CHECK9-NEXT: [[CMP7:%.*]] = icmp slt i32 [[TMP22]], [[ADD]]
+// CHECK9-NEXT: br i1 [[CMP7]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK9: omp.inner.for.body:
+// CHECK9-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK9-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK9-NEXT: [[TMP26:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0
+// CHECK9-NEXT: [[TMP27:%.*]] = inttoptr i32 [[TMP24]] to i8*
+// CHECK9-NEXT: store i8* [[TMP27]], i8** [[TMP26]], align 4
+// CHECK9-NEXT: [[TMP28:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1
+// CHECK9-NEXT: [[TMP29:%.*]] = inttoptr i32 [[TMP25]] to i8*
+// CHECK9-NEXT: store i8* [[TMP29]], i8** [[TMP28]], align 4
+// CHECK9-NEXT: [[TMP30:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2
+// CHECK9-NEXT: [[TMP31:%.*]] = bitcast i32* [[ARGC_ADDR]] to i8*
+// CHECK9-NEXT: store i8* [[TMP31]], i8** [[TMP30]], align 4
+// CHECK9-NEXT: [[TMP32:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3
+// CHECK9-NEXT: [[TMP33:%.*]] = bitcast i32* [[TMP2]] to i8*
+// CHECK9-NEXT: store i8* [[TMP33]], i8** [[TMP32]], align 4
+// CHECK9-NEXT: [[TMP34:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 4
+// CHECK9-NEXT: [[TMP35:%.*]] = bitcast [10 x i32]* [[B4]] to i8*
+// CHECK9-NEXT: store i8* [[TMP35]], i8** [[TMP34]], align 4
+// CHECK9-NEXT: [[TMP36:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 5
+// CHECK9-NEXT: [[TMP37:%.*]] = bitcast [10 x i32]* [[C1]] to i8*
+// CHECK9-NEXT: store i8* [[TMP37]], i8** [[TMP36]], align 4
+// CHECK9-NEXT: [[TMP38:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 6
+// CHECK9-NEXT: [[TMP39:%.*]] = bitcast [10 x i32]* [[TMP3]] to i8*
+// CHECK9-NEXT: store i8* [[TMP39]], i8** [[TMP38]], align 4
+// CHECK9-NEXT: [[TMP40:%.*]] = load i32, i32* [[TMP2]], align 4
+// CHECK9-NEXT: [[TOBOOL:%.*]] = icmp ne i32 [[TMP40]], 0
+// CHECK9-NEXT: [[TMP41:%.*]] = zext i1 [[TOBOOL]] to i32
+// CHECK9-NEXT: [[TMP42:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK9-NEXT: [[TMP43:%.*]] = load i32, i32* [[TMP42]], align 4
+// CHECK9-NEXT: [[TMP44:%.*]] = bitcast [7 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK9-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP43]], i32 [[TMP41]], i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, i32*, i32*, [10 x i32]*, [10 x i32]*, [10 x i32]*)* @__omp_outlined__1 to i8*), i8* null, i8** [[TMP44]], i32 7)
+// CHECK9-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK9: omp.inner.for.inc:
+// CHECK9-NEXT: [[TMP45:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK9-NEXT: [[TMP46:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK9-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP45]], [[TMP46]]
+// CHECK9-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_IV]], align 4
+// CHECK9-NEXT: [[TMP47:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK9-NEXT: [[TMP48:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK9-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP47]], [[TMP48]]
+// CHECK9-NEXT: store i32 [[ADD9]], i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK9-NEXT: [[TMP49:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK9-NEXT: [[TMP50:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK9-NEXT: [[ADD10:%.*]] = add nsw i32 [[TMP49]], [[TMP50]]
+// CHECK9-NEXT: store i32 [[ADD10]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK9-NEXT: [[TMP51:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK9-NEXT: [[TMP52:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK9-NEXT: [[CMP11:%.*]] = icmp sgt i32 [[TMP51]], [[TMP52]]
+// CHECK9-NEXT: br i1 [[CMP11]], label [[COND_TRUE12:%.*]], label [[COND_FALSE13:%.*]]
+// CHECK9: cond.true12:
+// CHECK9-NEXT: [[TMP53:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK9-NEXT: br label [[COND_END14:%.*]]
+// CHECK9: cond.false13:
+// CHECK9-NEXT: [[TMP54:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK9-NEXT: br label [[COND_END14]]
+// CHECK9: cond.end14:
+// CHECK9-NEXT: [[COND15:%.*]] = phi i32 [ [[TMP53]], [[COND_TRUE12]] ], [ [[TMP54]], [[COND_FALSE13]] ]
+// CHECK9-NEXT: store i32 [[COND15]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK9-NEXT: [[TMP55:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK9-NEXT: store i32 [[TMP55]], i32* [[DOTOMP_IV]], align 4
+// CHECK9-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK9: omp.inner.for.end:
+// CHECK9-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK9: omp.loop.exit:
+// CHECK9-NEXT: [[TMP56:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK9-NEXT: [[TMP57:%.*]] = load i32, i32* [[TMP56]], align 4
+// CHECK9-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP57]])
+// CHECK9-NEXT: [[TMP58:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK9-NEXT: [[TMP59:%.*]] = icmp ne i32 [[TMP58]], 0
+// CHECK9-NEXT: br i1 [[TMP59]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]]
+// CHECK9: .omp.lastprivate.then:
+// CHECK9-NEXT: [[TMP60:%.*]] = bitcast [10 x i32]* [[TMP1]] to i8*
+// CHECK9-NEXT: [[TMP61:%.*]] = bitcast [10 x i32]* [[C1]] to i8*
+// CHECK9-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 [[TMP60]], i8* align 4 [[TMP61]], i32 40, i1 false)
+// CHECK9-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]]
+// CHECK9: .omp.lastprivate.done:
+// CHECK9-NEXT: br label [[OMP_PRECOND_END]]
+// CHECK9: omp.precond.end:
+// CHECK9-NEXT: [[TMP62:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared", align 2
+// CHECK9-NEXT: call void @__kmpc_restore_team_static_memory(i16 1, i16 [[TMP62]])
+// CHECK9-NEXT: ret void
+//
+//
+// CHECK9-LABEL: define {{[^@]+}}@__omp_outlined__1
+// CHECK9-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32* nonnull align 4 dereferenceable(4) [[ARGC:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[C:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[D:%.*]]) #[[ATTR0]] {
+// CHECK9-NEXT: entry:
+// CHECK9-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK9-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK9-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4
+// CHECK9-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4
+// CHECK9-NEXT: [[ARGC_ADDR:%.*]] = alloca i32*, align 4
+// CHECK9-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 4
+// CHECK9-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4
+// CHECK9-NEXT: [[C_ADDR:%.*]] = alloca [10 x i32]*, align 4
+// CHECK9-NEXT: [[D_ADDR:%.*]] = alloca [10 x i32]*, align 4
+// CHECK9-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK9-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK9-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK9-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
+// CHECK9-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK9-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK9-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK9-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK9-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK9-NEXT: [[B3:%.*]] = alloca [10 x i32], align 4
+// CHECK9-NEXT: [[C4:%.*]] = alloca [10 x i32], align 4
+// CHECK9-NEXT: [[I5:%.*]] = alloca i32, align 4
+// CHECK9-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK9-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK9-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK9-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK9-NEXT: store i32* [[ARGC]], i32** [[ARGC_ADDR]], align 4
+// CHECK9-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 4
+// CHECK9-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4
+// CHECK9-NEXT: store [10 x i32]* [[C]], [10 x i32]** [[C_ADDR]], align 4
+// CHECK9-NEXT: store [10 x i32]* [[D]], [10 x i32]** [[D_ADDR]], align 4
+// CHECK9-NEXT: [[TMP0:%.*]] = load i32*, i32** [[ARGC_ADDR]], align 4
+// CHECK9-NEXT: [[TMP1:%.*]] = load i32*, i32** [[A_ADDR]], align 4
+// CHECK9-NEXT: [[TMP2:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4
+// CHECK9-NEXT: [[TMP3:%.*]] = load [10 x i32]*, [10 x i32]** [[C_ADDR]], align 4
+// CHECK9-NEXT: [[TMP4:%.*]] = load [10 x i32]*, [10 x i32]** [[D_ADDR]], align 4
+// CHECK9-NEXT: [[TMP5:%.*]] = load i32, i32* [[TMP0]], align 4
+// CHECK9-NEXT: store i32 [[TMP5]], i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK9-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK9-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP6]], 0
+// CHECK9-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK9-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1
+// CHECK9-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK9-NEXT: store i32 0, i32* [[I]], align 4
+// CHECK9-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK9-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP7]]
+// CHECK9-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]]
+// CHECK9: omp.precond.then:
+// CHECK9-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK9-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK9-NEXT: store i32 [[TMP8]], i32* [[DOTOMP_UB]], align 4
+// CHECK9-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK9-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK9-NEXT: store i32 [[TMP9]], i32* [[DOTOMP_LB]], align 4
+// CHECK9-NEXT: store i32 [[TMP10]], i32* [[DOTOMP_UB]], align 4
+// CHECK9-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK9-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK9-NEXT: [[TMP11:%.*]] = bitcast [10 x i32]* [[B3]] to i8*
+// CHECK9-NEXT: [[TMP12:%.*]] = bitcast [10 x i32]* [[TMP2]] to i8*
+// CHECK9-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 [[TMP11]], i8* align 4 [[TMP12]], i32 40, i1 false)
+// CHECK9-NEXT: [[TMP13:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK9-NEXT: [[TMP14:%.*]] = load i32, i32* [[TMP13]], align 4
+// CHECK9-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP14]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK9-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK9-NEXT: store i32 [[TMP15]], i32* [[DOTOMP_IV]], align 4
+// CHECK9-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK9: omp.inner.for.cond:
+// CHECK9-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK9-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK9-NEXT: [[CMP6:%.*]] = icmp ule i32 [[TMP16]], [[TMP17]]
+// CHECK9-NEXT: br i1 [[CMP6]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK9: omp.inner.for.body:
+// CHECK9-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK9-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP18]], 1
+// CHECK9-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK9-NEXT: store i32 [[ADD]], i32* [[I5]], align 4
+// CHECK9-NEXT: [[CALL:%.*]] = call i32 @_Z3fooPi(i32* [[I5]]) #[[ATTR5:[0-9]+]]
+// CHECK9-NEXT: [[CALL7:%.*]] = call i32 @_Z3fooPi(i32* [[TMP1]]) #[[ATTR5]]
+// CHECK9-NEXT: [[ADD8:%.*]] = add nsw i32 [[CALL]], [[CALL7]]
+// CHECK9-NEXT: [[TMP19:%.*]] = load i32, i32* [[I5]], align 4
+// CHECK9-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[B3]], i32 0, i32 [[TMP19]]
+// CHECK9-NEXT: [[CALL9:%.*]] = call i32 @_Z3fooPi(i32* [[ARRAYIDX]]) #[[ATTR5]]
+// CHECK9-NEXT: [[ADD10:%.*]] = add nsw i32 [[ADD8]], [[CALL9]]
+// CHECK9-NEXT: [[TMP20:%.*]] = load i32, i32* [[I5]], align 4
+// CHECK9-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[C4]], i32 0, i32 [[TMP20]]
+// CHECK9-NEXT: [[CALL12:%.*]] = call i32 @_Z3fooPi(i32* [[ARRAYIDX11]]) #[[ATTR5]]
+// CHECK9-NEXT: [[ADD13:%.*]] = add nsw i32 [[ADD10]], [[CALL12]]
+// CHECK9-NEXT: [[TMP21:%.*]] = load i32, i32* [[I5]], align 4
+// CHECK9-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP4]], i32 0, i32 [[TMP21]]
+// CHECK9-NEXT: [[CALL15:%.*]] = call i32 @_Z3fooPi(i32* [[ARRAYIDX14]]) #[[ATTR5]]
+// CHECK9-NEXT: [[ADD16:%.*]] = add nsw i32 [[ADD13]], [[CALL15]]
+// CHECK9-NEXT: store i32 [[ADD16]], i32* [[TMP1]], align 4
+// CHECK9-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK9: omp.body.continue:
+// CHECK9-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK9: omp.inner.for.inc:
+// CHECK9-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK9-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK9-NEXT: [[ADD17:%.*]] = add nsw i32 [[TMP22]], [[TMP23]]
+// CHECK9-NEXT: store i32 [[ADD17]], i32* [[DOTOMP_IV]], align 4
+// CHECK9-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK9: omp.inner.for.end:
+// CHECK9-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK9: omp.loop.exit:
+// CHECK9-NEXT: [[TMP24:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK9-NEXT: [[TMP25:%.*]] = load i32, i32* [[TMP24]], align 4
+// CHECK9-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP25]])
+// CHECK9-NEXT: [[TMP26:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK9-NEXT: [[TMP27:%.*]] = icmp ne i32 [[TMP26]], 0
+// CHECK9-NEXT: br i1 [[TMP27]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]]
+// CHECK9: .omp.lastprivate.then:
+// CHECK9-NEXT: [[TMP28:%.*]] = bitcast [10 x i32]* [[TMP3]] to i8*
+// CHECK9-NEXT: [[TMP29:%.*]] = bitcast [10 x i32]* [[C4]] to i8*
+// CHECK9-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 [[TMP28]], i8* align 4 [[TMP29]], i32 40, i1 false)
+// CHECK9-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]]
+// CHECK9: .omp.lastprivate.done:
+// CHECK9-NEXT: br label [[OMP_PRECOND_END]]
+// CHECK9: omp.precond.end:
+// CHECK9-NEXT: ret void
+//
+//
+// CHECK10-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l31
+// CHECK10-SAME: ([10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[C:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i32 [[ARGC:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[D:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK10-NEXT: entry:
+// CHECK10-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4
+// CHECK10-NEXT: [[C_ADDR:%.*]] = alloca [10 x i32]*, align 4
+// CHECK10-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 4
+// CHECK10-NEXT: [[ARGC_ADDR:%.*]] = alloca i32, align 4
+// CHECK10-NEXT: [[D_ADDR:%.*]] = alloca [10 x i32]*, align 4
+// CHECK10-NEXT: [[ARGC_CASTED:%.*]] = alloca i32, align 4
+// CHECK10-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK10-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK10-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK10-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4
+// CHECK10-NEXT: store [10 x i32]* [[C]], [10 x i32]** [[C_ADDR]], align 4
+// CHECK10-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 4
+// CHECK10-NEXT: store i32 [[ARGC]], i32* [[ARGC_ADDR]], align 4
+// CHECK10-NEXT: store [10 x i32]* [[D]], [10 x i32]** [[D_ADDR]], align 4
+// CHECK10-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4
+// CHECK10-NEXT: [[TMP1:%.*]] = load [10 x i32]*, [10 x i32]** [[C_ADDR]], align 4
+// CHECK10-NEXT: [[TMP2:%.*]] = load i32*, i32** [[A_ADDR]], align 4
+// CHECK10-NEXT: [[TMP3:%.*]] = load [10 x i32]*, [10 x i32]** [[D_ADDR]], align 4
+// CHECK10-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK10-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0)
+// CHECK10-NEXT: br label [[DOTEXECUTE:%.*]]
+// CHECK10: .execute:
+// CHECK10-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3:[0-9]+]])
+// CHECK10-NEXT: [[TMP5:%.*]] = load i32, i32* [[ARGC_ADDR]], align 4
+// CHECK10-NEXT: store i32 [[TMP5]], i32* [[ARGC_CASTED]], align 4
+// CHECK10-NEXT: [[TMP6:%.*]] = load i32, i32* [[ARGC_CASTED]], align 4
+// CHECK10-NEXT: store i32 [[TMP4]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK10-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x i32]* [[TMP0]], [10 x i32]* [[TMP1]], i32* [[TMP2]], i32 [[TMP6]], [10 x i32]* [[TMP3]]) #[[ATTR4:[0-9]+]]
+// CHECK10-NEXT: br label [[DOTOMP_DEINIT:%.*]]
+// CHECK10: .omp.deinit:
+// CHECK10-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0)
+// CHECK10-NEXT: br label [[DOTEXIT:%.*]]
+// CHECK10: .exit:
+// CHECK10-NEXT: ret void
+//
+//
+// CHECK10-LABEL: define {{[^@]+}}@__omp_outlined__
+// CHECK10-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[C:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i32 [[ARGC:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[D:%.*]]) #[[ATTR0]] {
+// CHECK10-NEXT: entry:
+// CHECK10-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK10-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK10-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4
+// CHECK10-NEXT: [[C_ADDR:%.*]] = alloca [10 x i32]*, align 4
+// CHECK10-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 4
+// CHECK10-NEXT: [[ARGC_ADDR:%.*]] = alloca i32, align 4
+// CHECK10-NEXT: [[D_ADDR:%.*]] = alloca [10 x i32]*, align 4
+// CHECK10-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK10-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK10-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK10-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4
+// CHECK10-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK10-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK10-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK10-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK10-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK10-NEXT: [[B4:%.*]] = alloca [10 x i32], align 4
+// CHECK10-NEXT: [[I5:%.*]] = alloca i32, align 4
+// CHECK10-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [7 x i8*], align 4
+// CHECK10-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK10-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK10-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4
+// CHECK10-NEXT: store [10 x i32]* [[C]], [10 x i32]** [[C_ADDR]], align 4
+// CHECK10-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 4
+// CHECK10-NEXT: store i32 [[ARGC]], i32* [[ARGC_ADDR]], align 4
+// CHECK10-NEXT: store [10 x i32]* [[D]], [10 x i32]** [[D_ADDR]], align 4
+// CHECK10-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4
+// CHECK10-NEXT: [[TMP1:%.*]] = load [10 x i32]*, [10 x i32]** [[C_ADDR]], align 4
+// CHECK10-NEXT: [[TMP2:%.*]] = load i32*, i32** [[A_ADDR]], align 4
+// CHECK10-NEXT: [[TMP3:%.*]] = load [10 x i32]*, [10 x i32]** [[D_ADDR]], align 4
+// CHECK10-NEXT: [[TMP4:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared", align 2
+// CHECK10-NEXT: [[TMP5:%.*]] = load i32, i32* @"_openmp_static_kernel$size", align 4
+// CHECK10-NEXT: call void @__kmpc_get_team_static_memory(i16 1, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds (%"union._shared_openmp_static_memory_type_$_", %"union._shared_openmp_static_memory_type_$_" addrspace(3)* @"_openmp_shared_static_glob_rd_$_", i32 0, i32 0, i32 0) to i8*), i32 [[TMP5]], i16 [[TMP4]], i8** addrspacecast (i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr" to i8**))
+// CHECK10-NEXT: [[TMP6:%.*]] = load i8*, i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr", align 4
+// CHECK10-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, i8* [[TMP6]], i32 0
+// CHECK10-NEXT: [[TMP8:%.*]] = bitcast i8* [[TMP7]] to %struct._globalized_locals_ty*
+// CHECK10-NEXT: [[C1:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP8]], i32 0, i32 0
+// CHECK10-NEXT: [[TMP9:%.*]] = load i32, i32* [[ARGC_ADDR]], align 4
+// CHECK10-NEXT: store i32 [[TMP9]], i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK10-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK10-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP10]], 0
+// CHECK10-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK10-NEXT: [[SUB3:%.*]] = sub nsw i32 [[DIV]], 1
+// CHECK10-NEXT: store i32 [[SUB3]], i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK10-NEXT: store i32 0, i32* [[I]], align 4
+// CHECK10-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK10-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP11]]
+// CHECK10-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]]
+// CHECK10: omp.precond.then:
+// CHECK10-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK10-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK10-NEXT: store i32 [[TMP12]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK10-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK10-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK10-NEXT: [[TMP13:%.*]] = bitcast [10 x i32]* [[B4]] to i8*
+// CHECK10-NEXT: [[TMP14:%.*]] = bitcast [10 x i32]* [[TMP0]] to i8*
+// CHECK10-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 [[TMP13]], i8* align 4 [[TMP14]], i32 40, i1 false)
+// CHECK10-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK10-NEXT: [[TMP15:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK10-NEXT: [[TMP16:%.*]] = load i32, i32* [[TMP15]], align 4
+// CHECK10-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1:[0-9]+]], i32 [[TMP16]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]])
+// CHECK10-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK10-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK10-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[TMP17]], [[TMP18]]
+// CHECK10-NEXT: br i1 [[CMP6]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK10: cond.true:
+// CHECK10-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK10-NEXT: br label [[COND_END:%.*]]
+// CHECK10: cond.false:
+// CHECK10-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK10-NEXT: br label [[COND_END]]
+// CHECK10: cond.end:
+// CHECK10-NEXT: [[COND:%.*]] = phi i32 [ [[TMP19]], [[COND_TRUE]] ], [ [[TMP20]], [[COND_FALSE]] ]
+// CHECK10-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK10-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK10-NEXT: store i32 [[TMP21]], i32* [[DOTOMP_IV]], align 4
+// CHECK10-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK10: omp.inner.for.cond:
+// CHECK10-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK10-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK10-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP23]], 1
+// CHECK10-NEXT: [[CMP7:%.*]] = icmp slt i32 [[TMP22]], [[ADD]]
+// CHECK10-NEXT: br i1 [[CMP7]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK10: omp.inner.for.body:
+// CHECK10-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK10-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK10-NEXT: [[TMP26:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0
+// CHECK10-NEXT: [[TMP27:%.*]] = inttoptr i32 [[TMP24]] to i8*
+// CHECK10-NEXT: store i8* [[TMP27]], i8** [[TMP26]], align 4
+// CHECK10-NEXT: [[TMP28:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1
+// CHECK10-NEXT: [[TMP29:%.*]] = inttoptr i32 [[TMP25]] to i8*
+// CHECK10-NEXT: store i8* [[TMP29]], i8** [[TMP28]], align 4
+// CHECK10-NEXT: [[TMP30:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2
+// CHECK10-NEXT: [[TMP31:%.*]] = bitcast i32* [[ARGC_ADDR]] to i8*
+// CHECK10-NEXT: store i8* [[TMP31]], i8** [[TMP30]], align 4
+// CHECK10-NEXT: [[TMP32:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3
+// CHECK10-NEXT: [[TMP33:%.*]] = bitcast i32* [[TMP2]] to i8*
+// CHECK10-NEXT: store i8* [[TMP33]], i8** [[TMP32]], align 4
+// CHECK10-NEXT: [[TMP34:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 4
+// CHECK10-NEXT: [[TMP35:%.*]] = bitcast [10 x i32]* [[B4]] to i8*
+// CHECK10-NEXT: store i8* [[TMP35]], i8** [[TMP34]], align 4
+// CHECK10-NEXT: [[TMP36:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 5
+// CHECK10-NEXT: [[TMP37:%.*]] = bitcast [10 x i32]* [[C1]] to i8*
+// CHECK10-NEXT: store i8* [[TMP37]], i8** [[TMP36]], align 4
+// CHECK10-NEXT: [[TMP38:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 6
+// CHECK10-NEXT: [[TMP39:%.*]] = bitcast [10 x i32]* [[TMP3]] to i8*
+// CHECK10-NEXT: store i8* [[TMP39]], i8** [[TMP38]], align 4
+// CHECK10-NEXT: [[TMP40:%.*]] = load i32, i32* [[TMP2]], align 4
+// CHECK10-NEXT: [[TOBOOL:%.*]] = icmp ne i32 [[TMP40]], 0
+// CHECK10-NEXT: [[TMP41:%.*]] = zext i1 [[TOBOOL]] to i32
+// CHECK10-NEXT: [[TMP42:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK10-NEXT: [[TMP43:%.*]] = load i32, i32* [[TMP42]], align 4
+// CHECK10-NEXT: [[TMP44:%.*]] = bitcast [7 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK10-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP43]], i32 [[TMP41]], i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, i32*, i32*, [10 x i32]*, [10 x i32]*, [10 x i32]*)* @__omp_outlined__1 to i8*), i8* null, i8** [[TMP44]], i32 7)
+// CHECK10-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK10: omp.inner.for.inc:
+// CHECK10-NEXT: [[TMP45:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK10-NEXT: [[TMP46:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK10-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP45]], [[TMP46]]
+// CHECK10-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_IV]], align 4
+// CHECK10-NEXT: [[TMP47:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK10-NEXT: [[TMP48:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK10-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP47]], [[TMP48]]
+// CHECK10-NEXT: store i32 [[ADD9]], i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK10-NEXT: [[TMP49:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK10-NEXT: [[TMP50:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK10-NEXT: [[ADD10:%.*]] = add nsw i32 [[TMP49]], [[TMP50]]
+// CHECK10-NEXT: store i32 [[ADD10]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK10-NEXT: [[TMP51:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK10-NEXT: [[TMP52:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK10-NEXT: [[CMP11:%.*]] = icmp sgt i32 [[TMP51]], [[TMP52]]
+// CHECK10-NEXT: br i1 [[CMP11]], label [[COND_TRUE12:%.*]], label [[COND_FALSE13:%.*]]
+// CHECK10: cond.true12:
+// CHECK10-NEXT: [[TMP53:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK10-NEXT: br label [[COND_END14:%.*]]
+// CHECK10: cond.false13:
+// CHECK10-NEXT: [[TMP54:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK10-NEXT: br label [[COND_END14]]
+// CHECK10: cond.end14:
+// CHECK10-NEXT: [[COND15:%.*]] = phi i32 [ [[TMP53]], [[COND_TRUE12]] ], [ [[TMP54]], [[COND_FALSE13]] ]
+// CHECK10-NEXT: store i32 [[COND15]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK10-NEXT: [[TMP55:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK10-NEXT: store i32 [[TMP55]], i32* [[DOTOMP_IV]], align 4
+// CHECK10-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK10: omp.inner.for.end:
+// CHECK10-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK10: omp.loop.exit:
+// CHECK10-NEXT: [[TMP56:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK10-NEXT: [[TMP57:%.*]] = load i32, i32* [[TMP56]], align 4
+// CHECK10-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP57]])
+// CHECK10-NEXT: [[TMP58:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK10-NEXT: [[TMP59:%.*]] = icmp ne i32 [[TMP58]], 0
+// CHECK10-NEXT: br i1 [[TMP59]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]]
+// CHECK10: .omp.lastprivate.then:
+// CHECK10-NEXT: [[TMP60:%.*]] = bitcast [10 x i32]* [[TMP1]] to i8*
+// CHECK10-NEXT: [[TMP61:%.*]] = bitcast [10 x i32]* [[C1]] to i8*
+// CHECK10-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 [[TMP60]], i8* align 4 [[TMP61]], i32 40, i1 false)
+// CHECK10-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]]
+// CHECK10: .omp.lastprivate.done:
+// CHECK10-NEXT: br label [[OMP_PRECOND_END]]
+// CHECK10: omp.precond.end:
+// CHECK10-NEXT: [[TMP62:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared", align 2
+// CHECK10-NEXT: call void @__kmpc_restore_team_static_memory(i16 1, i16 [[TMP62]])
+// CHECK10-NEXT: ret void
+//
+//
+// CHECK10-LABEL: define {{[^@]+}}@__omp_outlined__1
+// CHECK10-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32* nonnull align 4 dereferenceable(4) [[ARGC:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[C:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[D:%.*]]) #[[ATTR0]] {
+// CHECK10-NEXT: entry:
+// CHECK10-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK10-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK10-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4
+// CHECK10-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4
+// CHECK10-NEXT: [[ARGC_ADDR:%.*]] = alloca i32*, align 4
+// CHECK10-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 4
+// CHECK10-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4
+// CHECK10-NEXT: [[C_ADDR:%.*]] = alloca [10 x i32]*, align 4
+// CHECK10-NEXT: [[D_ADDR:%.*]] = alloca [10 x i32]*, align 4
+// CHECK10-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK10-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK10-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK10-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
+// CHECK10-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK10-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK10-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK10-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK10-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK10-NEXT: [[B3:%.*]] = alloca [10 x i32], align 4
+// CHECK10-NEXT: [[C4:%.*]] = alloca [10 x i32], align 4
+// CHECK10-NEXT: [[I5:%.*]] = alloca i32, align 4
+// CHECK10-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK10-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK10-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK10-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK10-NEXT: store i32* [[ARGC]], i32** [[ARGC_ADDR]], align 4
+// CHECK10-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 4
+// CHECK10-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4
+// CHECK10-NEXT: store [10 x i32]* [[C]], [10 x i32]** [[C_ADDR]], align 4
+// CHECK10-NEXT: store [10 x i32]* [[D]], [10 x i32]** [[D_ADDR]], align 4
+// CHECK10-NEXT: [[TMP0:%.*]] = load i32*, i32** [[ARGC_ADDR]], align 4
+// CHECK10-NEXT: [[TMP1:%.*]] = load i32*, i32** [[A_ADDR]], align 4
+// CHECK10-NEXT: [[TMP2:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4
+// CHECK10-NEXT: [[TMP3:%.*]] = load [10 x i32]*, [10 x i32]** [[C_ADDR]], align 4
+// CHECK10-NEXT: [[TMP4:%.*]] = load [10 x i32]*, [10 x i32]** [[D_ADDR]], align 4
+// CHECK10-NEXT: [[TMP5:%.*]] = load i32, i32* [[TMP0]], align 4
+// CHECK10-NEXT: store i32 [[TMP5]], i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK10-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK10-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP6]], 0
+// CHECK10-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK10-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1
+// CHECK10-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK10-NEXT: store i32 0, i32* [[I]], align 4
+// CHECK10-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK10-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP7]]
+// CHECK10-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]]
+// CHECK10: omp.precond.then:
+// CHECK10-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK10-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK10-NEXT: store i32 [[TMP8]], i32* [[DOTOMP_UB]], align 4
+// CHECK10-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK10-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK10-NEXT: store i32 [[TMP9]], i32* [[DOTOMP_LB]], align 4
+// CHECK10-NEXT: store i32 [[TMP10]], i32* [[DOTOMP_UB]], align 4
+// CHECK10-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK10-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK10-NEXT: [[TMP11:%.*]] = bitcast [10 x i32]* [[B3]] to i8*
+// CHECK10-NEXT: [[TMP12:%.*]] = bitcast [10 x i32]* [[TMP2]] to i8*
+// CHECK10-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 [[TMP11]], i8* align 4 [[TMP12]], i32 40, i1 false)
+// CHECK10-NEXT: [[TMP13:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK10-NEXT: [[TMP14:%.*]] = load i32, i32* [[TMP13]], align 4
+// CHECK10-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP14]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK10-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK10-NEXT: store i32 [[TMP15]], i32* [[DOTOMP_IV]], align 4
+// CHECK10-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK10: omp.inner.for.cond:
+// CHECK10-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK10-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK10-NEXT: [[CMP6:%.*]] = icmp ule i32 [[TMP16]], [[TMP17]]
+// CHECK10-NEXT: br i1 [[CMP6]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK10: omp.inner.for.body:
+// CHECK10-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK10-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP18]], 1
+// CHECK10-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK10-NEXT: store i32 [[ADD]], i32* [[I5]], align 4
+// CHECK10-NEXT: [[CALL:%.*]] = call i32 @_Z3fooPi(i32* [[I5]]) #[[ATTR5:[0-9]+]]
+// CHECK10-NEXT: [[CALL7:%.*]] = call i32 @_Z3fooPi(i32* [[TMP1]]) #[[ATTR5]]
+// CHECK10-NEXT: [[ADD8:%.*]] = add nsw i32 [[CALL]], [[CALL7]]
+// CHECK10-NEXT: [[TMP19:%.*]] = load i32, i32* [[I5]], align 4
+// CHECK10-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[B3]], i32 0, i32 [[TMP19]]
+// CHECK10-NEXT: [[CALL9:%.*]] = call i32 @_Z3fooPi(i32* [[ARRAYIDX]]) #[[ATTR5]]
+// CHECK10-NEXT: [[ADD10:%.*]] = add nsw i32 [[ADD8]], [[CALL9]]
+// CHECK10-NEXT: [[TMP20:%.*]] = load i32, i32* [[I5]], align 4
+// CHECK10-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[C4]], i32 0, i32 [[TMP20]]
+// CHECK10-NEXT: [[CALL12:%.*]] = call i32 @_Z3fooPi(i32* [[ARRAYIDX11]]) #[[ATTR5]]
+// CHECK10-NEXT: [[ADD13:%.*]] = add nsw i32 [[ADD10]], [[CALL12]]
+// CHECK10-NEXT: [[TMP21:%.*]] = load i32, i32* [[I5]], align 4
+// CHECK10-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP4]], i32 0, i32 [[TMP21]]
+// CHECK10-NEXT: [[CALL15:%.*]] = call i32 @_Z3fooPi(i32* [[ARRAYIDX14]]) #[[ATTR5]]
+// CHECK10-NEXT: [[ADD16:%.*]] = add nsw i32 [[ADD13]], [[CALL15]]
+// CHECK10-NEXT: store i32 [[ADD16]], i32* [[TMP1]], align 4
+// CHECK10-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK10: omp.body.continue:
+// CHECK10-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK10: omp.inner.for.inc:
+// CHECK10-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK10-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK10-NEXT: [[ADD17:%.*]] = add nsw i32 [[TMP22]], [[TMP23]]
+// CHECK10-NEXT: store i32 [[ADD17]], i32* [[DOTOMP_IV]], align 4
+// CHECK10-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK10: omp.inner.for.end:
+// CHECK10-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK10: omp.loop.exit:
+// CHECK10-NEXT: [[TMP24:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK10-NEXT: [[TMP25:%.*]] = load i32, i32* [[TMP24]], align 4
+// CHECK10-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP25]])
+// CHECK10-NEXT: [[TMP26:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK10-NEXT: [[TMP27:%.*]] = icmp ne i32 [[TMP26]], 0
+// CHECK10-NEXT: br i1 [[TMP27]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]]
+// CHECK10: .omp.lastprivate.then:
+// CHECK10-NEXT: [[TMP28:%.*]] = bitcast [10 x i32]* [[TMP3]] to i8*
+// CHECK10-NEXT: [[TMP29:%.*]] = bitcast [10 x i32]* [[C4]] to i8*
+// CHECK10-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 [[TMP28]], i8* align 4 [[TMP29]], i32 40, i1 false)
+// CHECK10-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]]
+// CHECK10: .omp.lastprivate.done:
+// CHECK10-NEXT: br label [[OMP_PRECOND_END]]
+// CHECK10: omp.precond.end:
+// CHECK10-NEXT: ret void
+//
+//
+// CHECK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l31
+// CHECK11-SAME: ([10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[C:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i32 [[ARGC:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[D:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK11-NEXT: entry:
+// CHECK11-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4
+// CHECK11-NEXT: [[C_ADDR:%.*]] = alloca [10 x i32]*, align 4
+// CHECK11-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 4
+// CHECK11-NEXT: [[ARGC_ADDR:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[D_ADDR:%.*]] = alloca [10 x i32]*, align 4
+// CHECK11-NEXT: [[ARGC_CASTED:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK11-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4
+// CHECK11-NEXT: store [10 x i32]* [[C]], [10 x i32]** [[C_ADDR]], align 4
+// CHECK11-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 4
+// CHECK11-NEXT: store i32 [[ARGC]], i32* [[ARGC_ADDR]], align 4
+// CHECK11-NEXT: store [10 x i32]* [[D]], [10 x i32]** [[D_ADDR]], align 4
+// CHECK11-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4
+// CHECK11-NEXT: [[TMP1:%.*]] = load [10 x i32]*, [10 x i32]** [[C_ADDR]], align 4
+// CHECK11-NEXT: [[TMP2:%.*]] = load i32*, i32** [[A_ADDR]], align 4
+// CHECK11-NEXT: [[TMP3:%.*]] = load [10 x i32]*, [10 x i32]** [[D_ADDR]], align 4
+// CHECK11-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK11-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0)
+// CHECK11-NEXT: br label [[DOTEXECUTE:%.*]]
+// CHECK11: .execute:
+// CHECK11-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3:[0-9]+]])
+// CHECK11-NEXT: [[TMP5:%.*]] = load i32, i32* [[ARGC_ADDR]], align 4
+// CHECK11-NEXT: store i32 [[TMP5]], i32* [[ARGC_CASTED]], align 4
+// CHECK11-NEXT: [[TMP6:%.*]] = load i32, i32* [[ARGC_CASTED]], align 4
+// CHECK11-NEXT: store i32 [[TMP4]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK11-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x i32]* [[TMP0]], [10 x i32]* [[TMP1]], i32* [[TMP2]], i32 [[TMP6]], [10 x i32]* [[TMP3]]) #[[ATTR4:[0-9]+]]
+// CHECK11-NEXT: br label [[DOTOMP_DEINIT:%.*]]
+// CHECK11: .omp.deinit:
+// CHECK11-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0)
+// CHECK11-NEXT: br label [[DOTEXIT:%.*]]
+// CHECK11: .exit:
+// CHECK11-NEXT: ret void
+//
+//
+// CHECK11-LABEL: define {{[^@]+}}@__omp_outlined__
+// CHECK11-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[C:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i32 [[ARGC:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[D:%.*]]) #[[ATTR0]] {
+// CHECK11-NEXT: entry:
+// CHECK11-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK11-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK11-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4
+// CHECK11-NEXT: [[C_ADDR:%.*]] = alloca [10 x i32]*, align 4
+// CHECK11-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 4
+// CHECK11-NEXT: [[ARGC_ADDR:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[D_ADDR:%.*]] = alloca [10 x i32]*, align 4
+// CHECK11-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[B4:%.*]] = alloca [10 x i32], align 4
+// CHECK11-NEXT: [[I5:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [7 x i8*], align 4
+// CHECK11-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK11-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK11-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4
+// CHECK11-NEXT: store [10 x i32]* [[C]], [10 x i32]** [[C_ADDR]], align 4
+// CHECK11-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 4
+// CHECK11-NEXT: store i32 [[ARGC]], i32* [[ARGC_ADDR]], align 4
+// CHECK11-NEXT: store [10 x i32]* [[D]], [10 x i32]** [[D_ADDR]], align 4
+// CHECK11-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4
+// CHECK11-NEXT: [[TMP1:%.*]] = load [10 x i32]*, [10 x i32]** [[C_ADDR]], align 4
+// CHECK11-NEXT: [[TMP2:%.*]] = load i32*, i32** [[A_ADDR]], align 4
+// CHECK11-NEXT: [[TMP3:%.*]] = load [10 x i32]*, [10 x i32]** [[D_ADDR]], align 4
+// CHECK11-NEXT: [[TMP4:%.*]] = call i8* @__kmpc_data_sharing_push_stack(i32 40, i16 1)
+// CHECK11-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to %struct._globalized_locals_ty*
+// CHECK11-NEXT: [[C1:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP5]], i32 0, i32 0
+// CHECK11-NEXT: [[TMP6:%.*]] = load i32, i32* [[ARGC_ADDR]], align 4
+// CHECK11-NEXT: store i32 [[TMP6]], i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK11-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK11-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP7]], 0
+// CHECK11-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK11-NEXT: [[SUB3:%.*]] = sub nsw i32 [[DIV]], 1
+// CHECK11-NEXT: store i32 [[SUB3]], i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK11-NEXT: store i32 0, i32* [[I]], align 4
+// CHECK11-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK11-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP8]]
+// CHECK11-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]]
+// CHECK11: omp.precond.then:
+// CHECK11-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK11-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK11-NEXT: store i32 [[TMP9]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK11-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK11-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK11-NEXT: [[TMP10:%.*]] = bitcast [10 x i32]* [[B4]] to i8*
+// CHECK11-NEXT: [[TMP11:%.*]] = bitcast [10 x i32]* [[TMP0]] to i8*
+// CHECK11-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 [[TMP10]], i8* align 4 [[TMP11]], i32 40, i1 false)
+// CHECK11-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK11-NEXT: [[TMP12:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK11-NEXT: [[TMP13:%.*]] = load i32, i32* [[TMP12]], align 4
+// CHECK11-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1:[0-9]+]], i32 [[TMP13]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]])
+// CHECK11-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK11-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK11-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[TMP14]], [[TMP15]]
+// CHECK11-NEXT: br i1 [[CMP6]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK11: cond.true:
+// CHECK11-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK11-NEXT: br label [[COND_END:%.*]]
+// CHECK11: cond.false:
+// CHECK11-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK11-NEXT: br label [[COND_END]]
+// CHECK11: cond.end:
+// CHECK11-NEXT: [[COND:%.*]] = phi i32 [ [[TMP16]], [[COND_TRUE]] ], [ [[TMP17]], [[COND_FALSE]] ]
+// CHECK11-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK11-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK11-NEXT: store i32 [[TMP18]], i32* [[DOTOMP_IV]], align 4
+// CHECK11-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK11: omp.inner.for.cond:
+// CHECK11-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK11-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK11-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP20]], 1
+// CHECK11-NEXT: [[CMP7:%.*]] = icmp slt i32 [[TMP19]], [[ADD]]
+//
CHECK11-NEXT: br i1 [[CMP7]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK11: omp.inner.for.body: +// CHECK11-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK11-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK11-NEXT: [[TMP23:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 +// CHECK11-NEXT: [[TMP24:%.*]] = inttoptr i32 [[TMP21]] to i8* +// CHECK11-NEXT: store i8* [[TMP24]], i8** [[TMP23]], align 4 +// CHECK11-NEXT: [[TMP25:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 +// CHECK11-NEXT: [[TMP26:%.*]] = inttoptr i32 [[TMP22]] to i8* +// CHECK11-NEXT: store i8* [[TMP26]], i8** [[TMP25]], align 4 +// CHECK11-NEXT: [[TMP27:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 +// CHECK11-NEXT: [[TMP28:%.*]] = bitcast i32* [[ARGC_ADDR]] to i8* +// CHECK11-NEXT: store i8* [[TMP28]], i8** [[TMP27]], align 4 +// CHECK11-NEXT: [[TMP29:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3 +// CHECK11-NEXT: [[TMP30:%.*]] = bitcast i32* [[TMP2]] to i8* +// CHECK11-NEXT: store i8* [[TMP30]], i8** [[TMP29]], align 4 +// CHECK11-NEXT: [[TMP31:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 4 +// CHECK11-NEXT: [[TMP32:%.*]] = bitcast [10 x i32]* [[B4]] to i8* +// CHECK11-NEXT: store i8* [[TMP32]], i8** [[TMP31]], align 4 +// CHECK11-NEXT: [[TMP33:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 5 +// CHECK11-NEXT: [[TMP34:%.*]] = bitcast [10 x i32]* [[C1]] to i8* +// CHECK11-NEXT: store i8* [[TMP34]], i8** [[TMP33]], align 4 +// CHECK11-NEXT: [[TMP35:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 6 +// CHECK11-NEXT: [[TMP36:%.*]] = bitcast [10 x i32]* [[TMP3]] to i8* +// CHECK11-NEXT: store i8* [[TMP36]], i8** [[TMP35]], align 4 +// CHECK11-NEXT: [[TMP37:%.*]] = load i32, i32* [[TMP2]], align 4 +// CHECK11-NEXT: [[TOBOOL:%.*]] = icmp ne i32 [[TMP37]], 0 +// CHECK11-NEXT: [[TMP38:%.*]] = zext i1 [[TOBOOL]] to i32 +// CHECK11-NEXT: [[TMP39:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK11-NEXT: [[TMP40:%.*]] = load i32, i32* [[TMP39]], align 4 +// CHECK11-NEXT: [[TMP41:%.*]] = bitcast [7 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK11-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP40]], i32 [[TMP38]], i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, i32*, i32*, [10 x i32]*, [10 x i32]*, [10 x i32]*)* @__omp_outlined__1 to i8*), i8* null, i8** [[TMP41]], i32 7) +// CHECK11-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK11: omp.inner.for.inc: +// CHECK11-NEXT: [[TMP42:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK11-NEXT: [[TMP43:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK11-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP42]], [[TMP43]] +// CHECK11-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_IV]], align 4 +// CHECK11-NEXT: [[TMP44:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK11-NEXT: [[TMP45:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK11-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP44]], [[TMP45]] +// CHECK11-NEXT: store i32 [[ADD9]], i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK11-NEXT: [[TMP46:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK11-NEXT: [[TMP47:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK11-NEXT: [[ADD10:%.*]] = add 
nsw i32 [[TMP46]], [[TMP47]] +// CHECK11-NEXT: store i32 [[ADD10]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK11-NEXT: [[TMP48:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK11-NEXT: [[TMP49:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK11-NEXT: [[CMP11:%.*]] = icmp sgt i32 [[TMP48]], [[TMP49]] +// CHECK11-NEXT: br i1 [[CMP11]], label [[COND_TRUE12:%.*]], label [[COND_FALSE13:%.*]] +// CHECK11: cond.true12: +// CHECK11-NEXT: [[TMP50:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK11-NEXT: br label [[COND_END14:%.*]] +// CHECK11: cond.false13: +// CHECK11-NEXT: [[TMP51:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK11-NEXT: br label [[COND_END14]] +// CHECK11: cond.end14: +// CHECK11-NEXT: [[COND15:%.*]] = phi i32 [ [[TMP50]], [[COND_TRUE12]] ], [ [[TMP51]], [[COND_FALSE13]] ] +// CHECK11-NEXT: store i32 [[COND15]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK11-NEXT: [[TMP52:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK11-NEXT: store i32 [[TMP52]], i32* [[DOTOMP_IV]], align 4 +// CHECK11-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK11: omp.inner.for.end: +// CHECK11-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK11: omp.loop.exit: +// CHECK11-NEXT: [[TMP53:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK11-NEXT: [[TMP54:%.*]] = load i32, i32* [[TMP53]], align 4 +// CHECK11-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP54]]) +// CHECK11-NEXT: [[TMP55:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK11-NEXT: [[TMP56:%.*]] = icmp ne i32 [[TMP55]], 0 +// CHECK11-NEXT: br i1 [[TMP56]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]] +// CHECK11: .omp.lastprivate.then: +// CHECK11-NEXT: [[TMP57:%.*]] = bitcast [10 x i32]* [[TMP1]] to i8* +// CHECK11-NEXT: [[TMP58:%.*]] = bitcast [10 x i32]* [[C1]] to i8* +// CHECK11-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 [[TMP57]], i8* align 4 [[TMP58]], i32 40, i1 false) +// CHECK11-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]] +// CHECK11: .omp.lastprivate.done: +// CHECK11-NEXT: br label [[OMP_PRECOND_END]] +// CHECK11: omp.precond.end: +// CHECK11-NEXT: call void @__kmpc_data_sharing_pop_stack(i8* [[TMP4]]) +// CHECK11-NEXT: ret void +// +// +// CHECK11-LABEL: define {{[^@]+}}@__omp_outlined__1 +// CHECK11-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32* nonnull align 4 dereferenceable(4) [[ARGC:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[C:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[D:%.*]]) #[[ATTR0]] { +// CHECK11-NEXT: entry: +// CHECK11-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK11-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK11-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[ARGC_ADDR:%.*]] = alloca i32*, align 4 +// CHECK11-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 4 +// CHECK11-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 +// CHECK11-NEXT: [[C_ADDR:%.*]] = alloca [10 x i32]*, align 4 +// CHECK11-NEXT: [[D_ADDR:%.*]] = alloca [10 x i32]*, align 4 +// CHECK11-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca 
i32, align 4 +// CHECK11-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[B3:%.*]] = alloca [10 x i32], align 4 +// CHECK11-NEXT: [[C4:%.*]] = alloca [10 x i32], align 4 +// CHECK11-NEXT: [[I5:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK11-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK11-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK11-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK11-NEXT: store i32* [[ARGC]], i32** [[ARGC_ADDR]], align 4 +// CHECK11-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 4 +// CHECK11-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 +// CHECK11-NEXT: store [10 x i32]* [[C]], [10 x i32]** [[C_ADDR]], align 4 +// CHECK11-NEXT: store [10 x i32]* [[D]], [10 x i32]** [[D_ADDR]], align 4 +// CHECK11-NEXT: [[TMP0:%.*]] = load i32*, i32** [[ARGC_ADDR]], align 4 +// CHECK11-NEXT: [[TMP1:%.*]] = load i32*, i32** [[A_ADDR]], align 4 +// CHECK11-NEXT: [[TMP2:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 +// CHECK11-NEXT: [[TMP3:%.*]] = load [10 x i32]*, [10 x i32]** [[C_ADDR]], align 4 +// CHECK11-NEXT: [[TMP4:%.*]] = load [10 x i32]*, [10 x i32]** [[D_ADDR]], align 4 +// CHECK11-NEXT: [[TMP5:%.*]] = load i32, i32* [[TMP0]], align 4 +// CHECK11-NEXT: store i32 [[TMP5]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK11-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK11-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP6]], 0 +// CHECK11-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK11-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK11-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK11-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK11-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK11-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP7]] +// CHECK11-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK11: omp.precond.then: +// CHECK11-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 +// CHECK11-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK11-NEXT: store i32 [[TMP8]], i32* [[DOTOMP_UB]], align 4 +// CHECK11-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK11-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK11-NEXT: store i32 [[TMP9]], i32* [[DOTOMP_LB]], align 4 +// CHECK11-NEXT: store i32 [[TMP10]], i32* [[DOTOMP_UB]], align 4 +// CHECK11-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK11-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK11-NEXT: [[TMP11:%.*]] = bitcast [10 x i32]* [[B3]] to i8* +// CHECK11-NEXT: [[TMP12:%.*]] = bitcast [10 x i32]* [[TMP2]] to i8* +// CHECK11-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 [[TMP11]], i8* align 4 [[TMP12]], i32 40, i1 false) +// CHECK11-NEXT: [[TMP13:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK11-NEXT: [[TMP14:%.*]] = load i32, i32* [[TMP13]], align 4 +// CHECK11-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP14]], i32 
33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) +// CHECK11-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK11-NEXT: store i32 [[TMP15]], i32* [[DOTOMP_IV]], align 4 +// CHECK11-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK11: omp.inner.for.cond: +// CHECK11-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK11-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK11-NEXT: [[CMP6:%.*]] = icmp ule i32 [[TMP16]], [[TMP17]] +// CHECK11-NEXT: br i1 [[CMP6]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK11: omp.inner.for.body: +// CHECK11-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK11-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP18]], 1 +// CHECK11-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK11-NEXT: store i32 [[ADD]], i32* [[I5]], align 4 +// CHECK11-NEXT: [[CALL:%.*]] = call i32 @_Z3fooPi(i32* [[I5]]) #[[ATTR5:[0-9]+]] +// CHECK11-NEXT: [[CALL7:%.*]] = call i32 @_Z3fooPi(i32* [[TMP1]]) #[[ATTR5]] +// CHECK11-NEXT: [[ADD8:%.*]] = add nsw i32 [[CALL]], [[CALL7]] +// CHECK11-NEXT: [[TMP19:%.*]] = load i32, i32* [[I5]], align 4 +// CHECK11-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[B3]], i32 0, i32 [[TMP19]] +// CHECK11-NEXT: [[CALL9:%.*]] = call i32 @_Z3fooPi(i32* [[ARRAYIDX]]) #[[ATTR5]] +// CHECK11-NEXT: [[ADD10:%.*]] = add nsw i32 [[ADD8]], [[CALL9]] +// CHECK11-NEXT: [[TMP20:%.*]] = load i32, i32* [[I5]], align 4 +// CHECK11-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[C4]], i32 0, i32 [[TMP20]] +// CHECK11-NEXT: [[CALL12:%.*]] = call i32 @_Z3fooPi(i32* [[ARRAYIDX11]]) #[[ATTR5]] +// CHECK11-NEXT: [[ADD13:%.*]] = add nsw i32 [[ADD10]], [[CALL12]] +// CHECK11-NEXT: [[TMP21:%.*]] = load i32, i32* [[I5]], align 4 +// CHECK11-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP4]], i32 0, i32 [[TMP21]] +// CHECK11-NEXT: [[CALL15:%.*]] = call i32 @_Z3fooPi(i32* [[ARRAYIDX14]]) #[[ATTR5]] +// CHECK11-NEXT: [[ADD16:%.*]] = add nsw i32 [[ADD13]], [[CALL15]] +// CHECK11-NEXT: store i32 [[ADD16]], i32* [[TMP1]], align 4 +// CHECK11-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK11: omp.body.continue: +// CHECK11-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK11: omp.inner.for.inc: +// CHECK11-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK11-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK11-NEXT: [[ADD17:%.*]] = add nsw i32 [[TMP22]], [[TMP23]] +// CHECK11-NEXT: store i32 [[ADD17]], i32* [[DOTOMP_IV]], align 4 +// CHECK11-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK11: omp.inner.for.end: +// CHECK11-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK11: omp.loop.exit: +// CHECK11-NEXT: [[TMP24:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK11-NEXT: [[TMP25:%.*]] = load i32, i32* [[TMP24]], align 4 +// CHECK11-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP25]]) +// CHECK11-NEXT: [[TMP26:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK11-NEXT: [[TMP27:%.*]] = icmp ne i32 [[TMP26]], 0 +// CHECK11-NEXT: br i1 [[TMP27]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]] +// CHECK11: .omp.lastprivate.then: +// CHECK11-NEXT: [[TMP28:%.*]] = bitcast [10 x i32]* [[TMP3]] to i8* +// CHECK11-NEXT: [[TMP29:%.*]] = bitcast [10 x i32]* [[C4]] to i8* +// CHECK11-NEXT: call void 
@llvm.memcpy.p0i8.p0i8.i32(i8* align 4 [[TMP28]], i8* align 4 [[TMP29]], i32 40, i1 false) +// CHECK11-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]] +// CHECK11: .omp.lastprivate.done: +// CHECK11-NEXT: br label [[OMP_PRECOND_END]] +// CHECK11: omp.precond.end: +// CHECK11-NEXT: ret void +// +// +// CHECK12-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l31 +// CHECK12-SAME: ([10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[C:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i32 [[ARGC:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[D:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK12-NEXT: entry: +// CHECK12-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 +// CHECK12-NEXT: [[C_ADDR:%.*]] = alloca [10 x i32]*, align 4 +// CHECK12-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 4 +// CHECK12-NEXT: [[ARGC_ADDR:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[D_ADDR:%.*]] = alloca [10 x i32]*, align 4 +// CHECK12-NEXT: [[ARGC_CASTED:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK12-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 +// CHECK12-NEXT: store [10 x i32]* [[C]], [10 x i32]** [[C_ADDR]], align 4 +// CHECK12-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 4 +// CHECK12-NEXT: store i32 [[ARGC]], i32* [[ARGC_ADDR]], align 4 +// CHECK12-NEXT: store [10 x i32]* [[D]], [10 x i32]** [[D_ADDR]], align 4 +// CHECK12-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 +// CHECK12-NEXT: [[TMP1:%.*]] = load [10 x i32]*, [10 x i32]** [[C_ADDR]], align 4 +// CHECK12-NEXT: [[TMP2:%.*]] = load i32*, i32** [[A_ADDR]], align 4 +// CHECK12-NEXT: [[TMP3:%.*]] = load [10 x i32]*, [10 x i32]** [[D_ADDR]], align 4 +// CHECK12-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK12-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) +// CHECK12-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK12: .execute: +// CHECK12-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3:[0-9]+]]) +// CHECK12-NEXT: [[TMP5:%.*]] = load i32, i32* [[ARGC_ADDR]], align 4 +// CHECK12-NEXT: store i32 [[TMP5]], i32* [[ARGC_CASTED]], align 4 +// CHECK12-NEXT: [[TMP6:%.*]] = load i32, i32* [[ARGC_CASTED]], align 4 +// CHECK12-NEXT: store i32 [[TMP4]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK12-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x i32]* [[TMP0]], [10 x i32]* [[TMP1]], i32* [[TMP2]], i32 [[TMP6]], [10 x i32]* [[TMP3]]) #[[ATTR4:[0-9]+]] +// CHECK12-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK12: .omp.deinit: +// CHECK12-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) +// CHECK12-NEXT: br label [[DOTEXIT:%.*]] +// CHECK12: .exit: +// CHECK12-NEXT: ret void +// +// +// CHECK12-LABEL: define {{[^@]+}}@__omp_outlined__ +// CHECK12-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[C:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i32 [[ARGC:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[D:%.*]]) #[[ATTR0]] { +// CHECK12-NEXT: entry: +// CHECK12-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK12-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca 
i32*, align 4 +// CHECK12-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 +// CHECK12-NEXT: [[C_ADDR:%.*]] = alloca [10 x i32]*, align 4 +// CHECK12-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 4 +// CHECK12-NEXT: [[ARGC_ADDR:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[D_ADDR:%.*]] = alloca [10 x i32]*, align 4 +// CHECK12-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[B4:%.*]] = alloca [10 x i32], align 4 +// CHECK12-NEXT: [[I5:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [7 x i8*], align 4 +// CHECK12-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK12-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK12-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 +// CHECK12-NEXT: store [10 x i32]* [[C]], [10 x i32]** [[C_ADDR]], align 4 +// CHECK12-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 4 +// CHECK12-NEXT: store i32 [[ARGC]], i32* [[ARGC_ADDR]], align 4 +// CHECK12-NEXT: store [10 x i32]* [[D]], [10 x i32]** [[D_ADDR]], align 4 +// CHECK12-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 +// CHECK12-NEXT: [[TMP1:%.*]] = load [10 x i32]*, [10 x i32]** [[C_ADDR]], align 4 +// CHECK12-NEXT: [[TMP2:%.*]] = load i32*, i32** [[A_ADDR]], align 4 +// CHECK12-NEXT: [[TMP3:%.*]] = load [10 x i32]*, [10 x i32]** [[D_ADDR]], align 4 +// CHECK12-NEXT: [[TMP4:%.*]] = call i8* @__kmpc_data_sharing_push_stack(i32 40, i16 1) +// CHECK12-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to %struct._globalized_locals_ty* +// CHECK12-NEXT: [[C1:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP5]], i32 0, i32 0 +// CHECK12-NEXT: [[TMP6:%.*]] = load i32, i32* [[ARGC_ADDR]], align 4 +// CHECK12-NEXT: store i32 [[TMP6]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK12-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK12-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP7]], 0 +// CHECK12-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK12-NEXT: [[SUB3:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK12-NEXT: store i32 [[SUB3]], i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK12-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK12-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK12-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP8]] +// CHECK12-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK12: omp.precond.then: +// CHECK12-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK12-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK12-NEXT: store i32 [[TMP9]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK12-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK12-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK12-NEXT: [[TMP10:%.*]] = bitcast [10 x i32]* [[B4]] to i8* +// CHECK12-NEXT: [[TMP11:%.*]] = bitcast [10 x i32]* [[TMP0]] to i8* +// CHECK12-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 [[TMP10]], i8* 
align 4 [[TMP11]], i32 40, i1 false) +// CHECK12-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK12-NEXT: [[TMP12:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK12-NEXT: [[TMP13:%.*]] = load i32, i32* [[TMP12]], align 4 +// CHECK12-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1:[0-9]+]], i32 [[TMP13]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) +// CHECK12-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK12-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK12-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[TMP14]], [[TMP15]] +// CHECK12-NEXT: br i1 [[CMP6]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK12: cond.true: +// CHECK12-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK12-NEXT: br label [[COND_END:%.*]] +// CHECK12: cond.false: +// CHECK12-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK12-NEXT: br label [[COND_END]] +// CHECK12: cond.end: +// CHECK12-NEXT: [[COND:%.*]] = phi i32 [ [[TMP16]], [[COND_TRUE]] ], [ [[TMP17]], [[COND_FALSE]] ] +// CHECK12-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK12-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK12-NEXT: store i32 [[TMP18]], i32* [[DOTOMP_IV]], align 4 +// CHECK12-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK12: omp.inner.for.cond: +// CHECK12-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK12-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK12-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP20]], 1 +// CHECK12-NEXT: [[CMP7:%.*]] = icmp slt i32 [[TMP19]], [[ADD]] +// CHECK12-NEXT: br i1 [[CMP7]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK12: omp.inner.for.body: +// CHECK12-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK12-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK12-NEXT: [[TMP23:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 +// CHECK12-NEXT: [[TMP24:%.*]] = inttoptr i32 [[TMP21]] to i8* +// CHECK12-NEXT: store i8* [[TMP24]], i8** [[TMP23]], align 4 +// CHECK12-NEXT: [[TMP25:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 +// CHECK12-NEXT: [[TMP26:%.*]] = inttoptr i32 [[TMP22]] to i8* +// CHECK12-NEXT: store i8* [[TMP26]], i8** [[TMP25]], align 4 +// CHECK12-NEXT: [[TMP27:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 +// CHECK12-NEXT: [[TMP28:%.*]] = bitcast i32* [[ARGC_ADDR]] to i8* +// CHECK12-NEXT: store i8* [[TMP28]], i8** [[TMP27]], align 4 +// CHECK12-NEXT: [[TMP29:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3 +// CHECK12-NEXT: [[TMP30:%.*]] = bitcast i32* [[TMP2]] to i8* +// CHECK12-NEXT: store i8* [[TMP30]], i8** [[TMP29]], align 4 +// CHECK12-NEXT: [[TMP31:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 4 +// CHECK12-NEXT: [[TMP32:%.*]] = bitcast [10 x i32]* [[B4]] to i8* +// CHECK12-NEXT: store i8* [[TMP32]], i8** [[TMP31]], align 4 +// CHECK12-NEXT: [[TMP33:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 5 +// CHECK12-NEXT: [[TMP34:%.*]] = bitcast [10 x i32]* [[C1]] to i8* +// CHECK12-NEXT: store i8* 
[[TMP34]], i8** [[TMP33]], align 4 +// CHECK12-NEXT: [[TMP35:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 6 +// CHECK12-NEXT: [[TMP36:%.*]] = bitcast [10 x i32]* [[TMP3]] to i8* +// CHECK12-NEXT: store i8* [[TMP36]], i8** [[TMP35]], align 4 +// CHECK12-NEXT: [[TMP37:%.*]] = load i32, i32* [[TMP2]], align 4 +// CHECK12-NEXT: [[TOBOOL:%.*]] = icmp ne i32 [[TMP37]], 0 +// CHECK12-NEXT: [[TMP38:%.*]] = zext i1 [[TOBOOL]] to i32 +// CHECK12-NEXT: [[TMP39:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK12-NEXT: [[TMP40:%.*]] = load i32, i32* [[TMP39]], align 4 +// CHECK12-NEXT: [[TMP41:%.*]] = bitcast [7 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK12-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP40]], i32 [[TMP38]], i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, i32*, i32*, [10 x i32]*, [10 x i32]*, [10 x i32]*)* @__omp_outlined__1 to i8*), i8* null, i8** [[TMP41]], i32 7) +// CHECK12-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK12: omp.inner.for.inc: +// CHECK12-NEXT: [[TMP42:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK12-NEXT: [[TMP43:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK12-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP42]], [[TMP43]] +// CHECK12-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_IV]], align 4 +// CHECK12-NEXT: [[TMP44:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK12-NEXT: [[TMP45:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK12-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP44]], [[TMP45]] +// CHECK12-NEXT: store i32 [[ADD9]], i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK12-NEXT: [[TMP46:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK12-NEXT: [[TMP47:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK12-NEXT: [[ADD10:%.*]] = add nsw i32 [[TMP46]], [[TMP47]] +// CHECK12-NEXT: store i32 [[ADD10]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK12-NEXT: [[TMP48:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK12-NEXT: [[TMP49:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK12-NEXT: [[CMP11:%.*]] = icmp sgt i32 [[TMP48]], [[TMP49]] +// CHECK12-NEXT: br i1 [[CMP11]], label [[COND_TRUE12:%.*]], label [[COND_FALSE13:%.*]] +// CHECK12: cond.true12: +// CHECK12-NEXT: [[TMP50:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK12-NEXT: br label [[COND_END14:%.*]] +// CHECK12: cond.false13: +// CHECK12-NEXT: [[TMP51:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK12-NEXT: br label [[COND_END14]] +// CHECK12: cond.end14: +// CHECK12-NEXT: [[COND15:%.*]] = phi i32 [ [[TMP50]], [[COND_TRUE12]] ], [ [[TMP51]], [[COND_FALSE13]] ] +// CHECK12-NEXT: store i32 [[COND15]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK12-NEXT: [[TMP52:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK12-NEXT: store i32 [[TMP52]], i32* [[DOTOMP_IV]], align 4 +// CHECK12-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK12: omp.inner.for.end: +// CHECK12-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK12: omp.loop.exit: +// CHECK12-NEXT: [[TMP53:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK12-NEXT: [[TMP54:%.*]] = load i32, i32* [[TMP53]], align 4 +// CHECK12-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP54]]) +// CHECK12-NEXT: [[TMP55:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK12-NEXT: [[TMP56:%.*]] = icmp ne i32 [[TMP55]], 0 +// CHECK12-NEXT: br i1 [[TMP56]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]] +// 
CHECK12: .omp.lastprivate.then: +// CHECK12-NEXT: [[TMP57:%.*]] = bitcast [10 x i32]* [[TMP1]] to i8* +// CHECK12-NEXT: [[TMP58:%.*]] = bitcast [10 x i32]* [[C1]] to i8* +// CHECK12-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 [[TMP57]], i8* align 4 [[TMP58]], i32 40, i1 false) +// CHECK12-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]] +// CHECK12: .omp.lastprivate.done: +// CHECK12-NEXT: br label [[OMP_PRECOND_END]] +// CHECK12: omp.precond.end: +// CHECK12-NEXT: call void @__kmpc_data_sharing_pop_stack(i8* [[TMP4]]) +// CHECK12-NEXT: ret void +// +// +// CHECK12-LABEL: define {{[^@]+}}@__omp_outlined__1 +// CHECK12-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32* nonnull align 4 dereferenceable(4) [[ARGC:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[C:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[D:%.*]]) #[[ATTR0]] { +// CHECK12-NEXT: entry: +// CHECK12-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK12-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK12-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[ARGC_ADDR:%.*]] = alloca i32*, align 4 +// CHECK12-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 4 +// CHECK12-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 +// CHECK12-NEXT: [[C_ADDR:%.*]] = alloca [10 x i32]*, align 4 +// CHECK12-NEXT: [[D_ADDR:%.*]] = alloca [10 x i32]*, align 4 +// CHECK12-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[B3:%.*]] = alloca [10 x i32], align 4 +// CHECK12-NEXT: [[C4:%.*]] = alloca [10 x i32], align 4 +// CHECK12-NEXT: [[I5:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK12-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK12-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK12-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK12-NEXT: store i32* [[ARGC]], i32** [[ARGC_ADDR]], align 4 +// CHECK12-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 4 +// CHECK12-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 +// CHECK12-NEXT: store [10 x i32]* [[C]], [10 x i32]** [[C_ADDR]], align 4 +// CHECK12-NEXT: store [10 x i32]* [[D]], [10 x i32]** [[D_ADDR]], align 4 +// CHECK12-NEXT: [[TMP0:%.*]] = load i32*, i32** [[ARGC_ADDR]], align 4 +// CHECK12-NEXT: [[TMP1:%.*]] = load i32*, i32** [[A_ADDR]], align 4 +// CHECK12-NEXT: [[TMP2:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 +// CHECK12-NEXT: [[TMP3:%.*]] = load [10 x i32]*, [10 x i32]** [[C_ADDR]], align 4 +// CHECK12-NEXT: [[TMP4:%.*]] = load [10 x i32]*, [10 x i32]** [[D_ADDR]], align 4 +// CHECK12-NEXT: [[TMP5:%.*]] = load i32, i32* [[TMP0]], align 4 +// CHECK12-NEXT: 
store i32 [[TMP5]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK12-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK12-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP6]], 0 +// CHECK12-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK12-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK12-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK12-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK12-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK12-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP7]] +// CHECK12-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK12: omp.precond.then: +// CHECK12-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 +// CHECK12-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK12-NEXT: store i32 [[TMP8]], i32* [[DOTOMP_UB]], align 4 +// CHECK12-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK12-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK12-NEXT: store i32 [[TMP9]], i32* [[DOTOMP_LB]], align 4 +// CHECK12-NEXT: store i32 [[TMP10]], i32* [[DOTOMP_UB]], align 4 +// CHECK12-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK12-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK12-NEXT: [[TMP11:%.*]] = bitcast [10 x i32]* [[B3]] to i8* +// CHECK12-NEXT: [[TMP12:%.*]] = bitcast [10 x i32]* [[TMP2]] to i8* +// CHECK12-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 [[TMP11]], i8* align 4 [[TMP12]], i32 40, i1 false) +// CHECK12-NEXT: [[TMP13:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK12-NEXT: [[TMP14:%.*]] = load i32, i32* [[TMP13]], align 4 +// CHECK12-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP14]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) +// CHECK12-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK12-NEXT: store i32 [[TMP15]], i32* [[DOTOMP_IV]], align 4 +// CHECK12-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK12: omp.inner.for.cond: +// CHECK12-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK12-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK12-NEXT: [[CMP6:%.*]] = icmp ule i32 [[TMP16]], [[TMP17]] +// CHECK12-NEXT: br i1 [[CMP6]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK12: omp.inner.for.body: +// CHECK12-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK12-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP18]], 1 +// CHECK12-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK12-NEXT: store i32 [[ADD]], i32* [[I5]], align 4 +// CHECK12-NEXT: [[CALL:%.*]] = call i32 @_Z3fooPi(i32* [[I5]]) #[[ATTR5:[0-9]+]] +// CHECK12-NEXT: [[CALL7:%.*]] = call i32 @_Z3fooPi(i32* [[TMP1]]) #[[ATTR5]] +// CHECK12-NEXT: [[ADD8:%.*]] = add nsw i32 [[CALL]], [[CALL7]] +// CHECK12-NEXT: [[TMP19:%.*]] = load i32, i32* [[I5]], align 4 +// CHECK12-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[B3]], i32 0, i32 [[TMP19]] +// CHECK12-NEXT: [[CALL9:%.*]] = call i32 @_Z3fooPi(i32* [[ARRAYIDX]]) #[[ATTR5]] +// CHECK12-NEXT: [[ADD10:%.*]] = add nsw i32 [[ADD8]], [[CALL9]] +// CHECK12-NEXT: [[TMP20:%.*]] = load i32, i32* [[I5]], align 4 +// CHECK12-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[C4]], i32 0, i32 [[TMP20]] +// CHECK12-NEXT: [[CALL12:%.*]] = call i32 @_Z3fooPi(i32* 
+// CHECK12-NEXT: [[ADD13:%.*]] = add nsw i32 [[ADD10]], [[CALL12]]
+// CHECK12-NEXT: [[TMP21:%.*]] = load i32, i32* [[I5]], align 4
+// CHECK12-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP4]], i32 0, i32 [[TMP21]]
+// CHECK12-NEXT: [[CALL15:%.*]] = call i32 @_Z3fooPi(i32* [[ARRAYIDX14]]) #[[ATTR5]]
+// CHECK12-NEXT: [[ADD16:%.*]] = add nsw i32 [[ADD13]], [[CALL15]]
+// CHECK12-NEXT: store i32 [[ADD16]], i32* [[TMP1]], align 4
+// CHECK12-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK12: omp.body.continue:
+// CHECK12-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK12: omp.inner.for.inc:
+// CHECK12-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK12-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK12-NEXT: [[ADD17:%.*]] = add nsw i32 [[TMP22]], [[TMP23]]
+// CHECK12-NEXT: store i32 [[ADD17]], i32* [[DOTOMP_IV]], align 4
+// CHECK12-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK12: omp.inner.for.end:
+// CHECK12-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK12: omp.loop.exit:
+// CHECK12-NEXT: [[TMP24:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK12-NEXT: [[TMP25:%.*]] = load i32, i32* [[TMP24]], align 4
+// CHECK12-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP25]])
+// CHECK12-NEXT: [[TMP26:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK12-NEXT: [[TMP27:%.*]] = icmp ne i32 [[TMP26]], 0
+// CHECK12-NEXT: br i1 [[TMP27]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]]
+// CHECK12: .omp.lastprivate.then:
+// CHECK12-NEXT: [[TMP28:%.*]] = bitcast [10 x i32]* [[TMP3]] to i8*
+// CHECK12-NEXT: [[TMP29:%.*]] = bitcast [10 x i32]* [[C4]] to i8*
+// CHECK12-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 [[TMP28]], i8* align 4 [[TMP29]], i32 40, i1 false)
+// CHECK12-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]]
+// CHECK12: .omp.lastprivate.done:
+// CHECK12-NEXT: br label [[OMP_PRECOND_END]]
+// CHECK12: omp.precond.end:
+// CHECK12-NEXT: ret void
+//
diff --git a/clang/test/OpenMP/nvptx_lambda_capturing.cpp b/clang/test/OpenMP/nvptx_lambda_capturing.cpp
--- a/clang/test/OpenMP/nvptx_lambda_capturing.cpp
+++ b/clang/test/OpenMP/nvptx_lambda_capturing.cpp
@@ -1,58 +1,18 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature --include-generated-funcs --replace-function-regex "__omp_offloading_[0-9a-z]+_[0-9a-z]+" --prefix-filecheck-ir-name _
// REQUIRES: powerpc-registered-target
// REQUIRES: nvptx-registered-target
-// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -o - | FileCheck %s --check-prefix HOST
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -o - | FileCheck %s --check-prefix CHECK1
// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc
-// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple nvptx64-nvidia-cuda -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefixes=CLASS,FUN,CHECK
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple nvptx64-nvidia-cuda -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix CHECK2
// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple nvptx64-nvidia-cuda -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -emit-pch -o %t
-// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple nvptx64-nvidia-cuda -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -include-pch %t -o - | FileCheck %s --check-prefixes=CLASS,CHECK
-// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple nvptx64-nvidia-cuda -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -include-pch %t -o - | FileCheck %s --check-prefixes=FUN,CHECK
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple nvptx64-nvidia-cuda -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -include-pch %t -o - | FileCheck %s --check-prefix CHECK3
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple nvptx64-nvidia-cuda -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -include-pch %t -o - | FileCheck %s --check-prefix CHECK4
// expected-no-diagnostics
#ifndef HEADER
#define HEADER
-// HOST-DAG: = private unnamed_addr constant [11 x i64] [i64 4, i64 4, i64 4, i64 0, i64 4, i64 40, i64 4, i64 4, i64 4, i64 8, i64 4]
-// HOST-DAG: = private unnamed_addr constant [11 x i64] [i64 288, i64 673, i64 673, i64 544, i64 33, i64 673, i64 1688849860264720, i64 1688849860264720, i64 1688849860264720, i64 1688849860264720, i64 1688849860264720]
-// HOST-DAG: = private unnamed_addr constant [11 x i64] [i64 4, i64 4, i64 4, i64 0, i64 4, i64 40, i64 4, i64 4, i64 4, i64 8, i64 4]
-// HOST-DAG: = private unnamed_addr constant [11 x i64] [i64 673, i64 673, i64 673, i64 544, i64 673, i64 673, i64 1688849860264720, i64 1688849860264720, i64 1688849860264720, i64 1688849860264720, i64 1688849860264720]
-// HOST-DAG: = private unnamed_addr constant [3 x i64] [i64 4, i64 8, i64 8]
-// HOST-DAG: = private unnamed_addr constant [3 x i64] [i64 547, i64 673, i64 562949953422096]
-// HOST-DAG: = private unnamed_addr constant [3 x i64] [i64 4, i64 8, i64 8]
-// HOST-DAG: = private unnamed_addr constant [3 x i64] [i64 547, i64 673, i64 562949953422096]
-// HOST-DAG: = private unnamed_addr constant [2 x i64] [i64 8, i64 8]
-// HOST-DAG: = private unnamed_addr constant [2 x i64] [i64 673, i64 281474976711440]
-// CHECK-DAG: [[S:%.+]] = type { i32 }
-// CHECK-DAG: [[CAP1:%.+]] = type { [[S]]* }
-// CHECK-DAG: [[CAP2:%.+]] = type { i32*, i32*, i32*, i32**, i32* }
-
-// CLASS: define internal void @__omp_offloading_{{.*}}_{{.*}}foo{{.*}}_l67_worker()
-// CLASS: define weak void @__omp_offloading_{{.*}}_{{.*}}foo{{.*}}_l67([[S]]* {{%.+}}, [[CAP1]]* nonnull align 8 dereferenceable(8) {{%.+}})
-// CLASS-NOT: getelementptr
-// CLASS: br i1 %
-// CLASS: call void @__omp_offloading_{{.*}}_{{.*}}foo{{.*}}_l67_worker()
-// CLASS: br label %
-// CLASS: br i1 %
-// CLASS: call void @__kmpc_kernel_init(
-// CLASS: call void @__kmpc_data_sharing_init_stack()
-// CLASS: call void @llvm.memcpy.
-// CLASS: [[L:%.+]] = load [[CAP1]]*, [[CAP1]]** [[L_ADDR:%.+]],
-// CLASS: [[THIS_REF:%.+]] = getelementptr inbounds [[CAP1]], [[CAP1]]* [[L]], i32 0, i32 0
-// CLASS: store [[S]]* [[S_:%.+]], [[S]]** [[THIS_REF]],
-// CLASS: [[L:%.+]] = load [[CAP1]]*, [[CAP1]]** [[L_ADDR]],
-// CLASS: call i32 [[LAMBDA1:@.+foo.+]]([[CAP1]]* {{[^,]*}} [[L]])
-// CLASS: ret void
-
-// CLASS: define weak void @__omp_offloading_{{.+}}foo{{.+}}_l69([[S]]* %{{.+}}, [[CAP1]]* nonnull align 8 dereferenceable(8) %{{.+}})
-// CLASS-NOT: getelementptr
-// CLASS: call void @llvm.memcpy.
-// CLASS: [[L:%.+]] = load [[CAP1]]*, [[CAP1]]** [[L_ADDR:%.+]],
-// CLASS: [[THIS_REF:%.+]] = getelementptr inbounds [[CAP1]], [[CAP1]]* [[L]], i32 0, i32 0
-// CLASS: store [[S]]* %{{.+}}, [[S]]** [[THIS_REF]],
-// CLASS: [[L:%.+]] = load [[CAP1]]*, [[CAP1]]** [[L_ADDR]],
-// CLASS: call i32 [[LAMBDA1]]([[CAP1]]* {{[^,]*}} [[L]])
-// CLASS: ret void
-
template <typename T> int foo(const T &t) {
#pragma omp target parallel
@@ -72,49 +32,6 @@
  }
} s;
-// FUN: define internal void @__omp_offloading_{{.+}}_main_l124_worker()
-// FUN: define weak void @__omp_offloading_{{.+}}_main_l124(i64 %{{.+}}, i32* nonnull align 4 dereferenceable(4) %{{.+}}, i32* nonnull align 4 dereferenceable(4) %{{.+}}, i32* %{{.+}}, i32* nonnull align 4 dereferenceable(4) %{{.+}}, [[CAP2]]* nonnull align 8 dereferenceable(40) %{{.+}})
-// FUN-NOT: getelementptr
-// FUN: br i1 %
-// FUN: call void @__omp_offloading_{{.*}}_{{.*}}main{{.*}}_l124_worker()
-// FUN: br label %
-// FUN: br i1 %
-// FUN: call void @__kmpc_kernel_init(
-// FUN: call void @__kmpc_data_sharing_init_stack()
-// FUN: call void @llvm.memcpy.
-// FUN: [[L:%.+]] = load [[CAP2]]*, [[CAP2]]** [[L_ADDR:%.+]],
-// FUN: [[ARGC_CAP:%.+]] = getelementptr inbounds [[CAP2]], [[CAP2]]* [[L]], i32 0, i32 0
-// FUN: store i32* %{{.+}}, i32** [[ARGC_CAP]],
-// FUN: [[B_CAP:%.+]] = getelementptr inbounds [[CAP2]], [[CAP2]]* [[L]], i32 0, i32 1
-// FUN: store i32* %{{.+}}, i32** [[B_CAP]],
-// FUN: [[C_CAP:%.+]] = getelementptr inbounds [[CAP2]], [[CAP2]]* [[L]], i32 0, i32 2
-// FUN: store i32* %{{.+}}, i32** [[C_CAP]],
-// FUN: [[D_CAP:%.+]] = getelementptr inbounds [[CAP2]], [[CAP2]]* [[L]], i32 0, i32 3
-// FUN: store i32** %{{.+}}, i32*** [[D_CAP]],
-// FUN: [[A_CAP:%.+]] = getelementptr inbounds [[CAP2]], [[CAP2]]* [[L]], i32 0, i32 4
-// FUN: store i32* %{{.+}}, i32** [[A_CAP]],
-// FUN: [[L:%.+]] = load [[CAP2]]*, [[CAP2]]** [[L_ADDR]],
-// FUN: call i64 [[LAMBDA2:@.+main.+]]([[CAP2]]* {{[^,]*}} [[L]])
-// FUN: ret void
-
-// FUN: define weak void @__omp_offloading_{{.+}}_main_l126(i32* nonnull align 4 dereferenceable(4) %{{.+}}, i32* nonnull align 4 dereferenceable(4) %{{.+}} i32* nonnull align 4 dereferenceable(4) %{{.+}}, i32* %{{.+}}, i32* nonnull align 4 dereferenceable(4) %{{.+}}, [[CAP2]]* nonnull align 8 dereferenceable(40) %{{.+}})
-// FUN-NOT: getelementptr
-// FUN: call void @llvm.memcpy.
-// FUN: [[L:%.+]] = load [[CAP2]]*, [[CAP2]]** [[L_ADDR:%.+]],
-// FUN: [[ARGC_CAP:%.+]] = getelementptr inbounds [[CAP2]], [[CAP2]]* [[L]], i32 0, i32 0
-// FUN: store i32* %{{.+}}, i32** [[ARGC_CAP]],
-// FUN: [[B_CAP:%.+]] = getelementptr inbounds [[CAP2]], [[CAP2]]* [[L]], i32 0, i32 1
-// FUN: store i32* %{{.+}}, i32** [[B_CAP]],
-// FUN: [[C_CAP:%.+]] = getelementptr inbounds [[CAP2]], [[CAP2]]* [[L]], i32 0, i32 2
-// FUN: store i32* %{{.+}}, i32** [[C_CAP]],
-// FUN: [[D_CAP:%.+]] = getelementptr inbounds [[CAP2]], [[CAP2]]* [[L]], i32 0, i32 3
-// FUN: store i32** %{{.+}}, i32*** [[D_CAP]],
-// FUN: [[A_CAP:%.+]] = getelementptr inbounds [[CAP2]], [[CAP2]]* [[L]], i32 0, i32 4
-// FUN: store i32* %{{.+}}, i32** [[A_CAP]],
-// FUN: [[L:%.+]] = load [[CAP2]]*, [[CAP2]]** [[L_ADDR]],
-// FUN: call i64 [[LAMBDA2]]([[CAP2]]* {{[^,]*}} [[L]])
-// FUN: ret void
-
int main(int argc, char **argv) {
int &b = argc;
int &&c = 1;
@@ -128,19 +45,2188 @@
return argc + s.foo();
}
-
-// HOST-LABEL: @main
-
-// HOST-DAG: call i32 @__tgt_target_mapper(%struct.ident_t* @{{.+}}, i64 -1, i8* @{{.+}}, i32 11, i8** [[BASES:%.+]], i8** [[PTRS:%.+]],
-// HOST-DAG: [[BASES:%.+]] = getelementptr inbounds [11 x i8*], [11 x i8*]* [[BASE_PTR:%.+]], i32 0, i32 0
-// HOST-DAG: [[PTRS:%.+]] = getelementptr inbounds [11 x i8*], [11 x i8*]* [[PTR_PTR:%.+]], i32 0, i32 0
-// HOST-DAG: [[BASE_REF:%.+]] = getelementptr inbounds [11 x i8*], [11 x i8*]* [[BASE_PTR]], i32 0, i32 6
-// HOST-DAG: [[BASE_REF_CAST:%.+]] = bitcast i8** [[BASE_REF]] to i32***
-// HOST-DAG: store i32** [[BASE:%.+]], i32*** [[BASE_REF_CAST]],
-// HOST-DAG: [[BASE]] = getelementptr inbounds [[LAMBDA:%.+]], [[LAMBDA]]* [[LAMBDA_ADDR:%.+]], i32 0, i32 0
-// HOST-DAG: [[PTR_REF:%.+]] = getelementptr inbounds [11 x i8*], [11 x i8*]* [[PTR_PTR]], i32 0, i32 6
-// HOST-DAG: [[PTR_REF_CAST:%.+]] = bitcast i8** [[PTR_REF]] to i32**
-// HOST-DAG: store i32* [[PTR:%.+]], i32** [[PTR_REF_CAST]],
-// HOST-DAG: [[PTR]] = load i32*, i32** [[PTR_REF:%.+]],
-// HOST-DAG: [[PTR_REF]] = getelementptr inbounds [[LAMBDA]], [[LAMBDA]]* [[LAMBDA_ADDR]], i32 0, i32 0
#endif // HEADER
+// CHECK1-LABEL: define {{[^@]+}}@main
+// CHECK1-SAME: (i32 signext [[ARGC:%.*]], i8** [[ARGV:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[RETVAL:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[ARGC_ADDR:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[ARGV_ADDR:%.*]] = alloca i8**, align 8
+// CHECK1-NEXT: [[B:%.*]] = alloca i32*, align 8
+// CHECK1-NEXT: [[C:%.*]] = alloca i32*, align 8
+// CHECK1-NEXT: [[REF_TMP:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[D:%.*]] = alloca i32*, align 8
+// CHECK1-NEXT: [[A:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[L:%.*]] = alloca %class.anon*, align 8
+// CHECK1-NEXT: [[REF_TMP1:%.*]] = alloca [[CLASS_ANON:%.*]], align 8
+// CHECK1-NEXT: [[TMP:%.*]] = alloca i32*, align 8
+// CHECK1-NEXT: [[_TMP2:%.*]] = alloca i32*, align 8
+// CHECK1-NEXT: [[_TMP3:%.*]] = alloca %class.anon*, align 8
+// CHECK1-NEXT: [[ARGC_CASTED:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [11 x i8*], align 8
+// CHECK1-NEXT: [[DOTOFFLOAD_PTRS:%.*]] = alloca [11 x i8*], align 8
+// CHECK1-NEXT: [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [11 x i8*], align 8
+// CHECK1-NEXT: [[_TMP4:%.*]] = alloca i32*, align 8
+// CHECK1-NEXT: [[_TMP5:%.*]] = alloca i32*, align 8
+// CHECK1-NEXT: [[_TMP6:%.*]] = alloca %class.anon*, align 8
+// CHECK1-NEXT: [[DOTOFFLOAD_BASEPTRS7:%.*]] = alloca [11 x i8*], align 8
+// CHECK1-NEXT: [[DOTOFFLOAD_PTRS8:%.*]] = alloca [11 x i8*], align 8
+// CHECK1-NEXT: [[DOTOFFLOAD_MAPPERS9:%.*]] = alloca [11 x i8*], align 8
+// CHECK1-NEXT: store i32 0, i32* [[RETVAL]], align 4
+// CHECK1-NEXT: store i32 [[ARGC]], i32* [[ARGC_ADDR]], align 4
+// CHECK1-NEXT: store i8** [[ARGV]], i8*** [[ARGV_ADDR]], align 8
+// CHECK1-NEXT: store i32* [[ARGC_ADDR]], i32** [[B]], align 8
+// CHECK1-NEXT: store i32 1, i32* [[REF_TMP]], align 4
+// CHECK1-NEXT: store i32* [[REF_TMP]], i32** [[C]], align 8
+// CHECK1-NEXT: store i32* [[ARGC_ADDR]], i32** [[D]], align 8
+// CHECK1-NEXT: [[TMP0:%.*]] = getelementptr inbounds [[CLASS_ANON]], %class.anon* [[REF_TMP1]], i32 0, i32 0
+// CHECK1-NEXT: store i32* [[ARGC_ADDR]], i32** [[TMP0]], align 8
+// CHECK1-NEXT: [[TMP1:%.*]] = getelementptr inbounds [[CLASS_ANON]], %class.anon* [[REF_TMP1]], i32 0, i32 1
+// CHECK1-NEXT: [[TMP2:%.*]] = load i32*, i32** [[B]], align 8
+// CHECK1-NEXT: store i32* [[TMP2]], i32** [[TMP1]], align 8
+// CHECK1-NEXT: [[TMP3:%.*]] = getelementptr inbounds [[CLASS_ANON]], %class.anon* [[REF_TMP1]], i32 0, i32 2
+// CHECK1-NEXT: [[TMP4:%.*]] = load i32*, i32** [[C]], align 8
+// CHECK1-NEXT: store i32* [[TMP4]], i32** [[TMP3]], align 8
+// CHECK1-NEXT: [[TMP5:%.*]] = getelementptr inbounds [[CLASS_ANON]], %class.anon* [[REF_TMP1]], i32 0, i32 3
+// CHECK1-NEXT: store i32** [[D]], i32*** [[TMP5]], align 8
+// CHECK1-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[CLASS_ANON]], %class.anon* [[REF_TMP1]], i32 0, i32 4
+// CHECK1-NEXT: store i32* [[A]], i32** [[TMP6]], align 8
+// CHECK1-NEXT: store %class.anon* [[REF_TMP1]], %class.anon** [[L]], align 8
+// CHECK1-NEXT: [[TMP7:%.*]] = load i32*, i32** [[B]], align 8
+// CHECK1-NEXT: store i32* [[TMP7]], i32** [[TMP]], align 8
+// CHECK1-NEXT: [[TMP8:%.*]] = load i32*, i32** [[C]], align 8
+// CHECK1-NEXT: store i32* [[TMP8]], i32** [[_TMP2]], align 8
+// CHECK1-NEXT: [[TMP9:%.*]] = load %class.anon*, %class.anon** [[L]], align 8
+// CHECK1-NEXT: store %class.anon* [[TMP9]], %class.anon** [[_TMP3]], align 8
+// CHECK1-NEXT: [[TMP10:%.*]] = load i32, i32* [[ARGC_ADDR]], align 4
+// CHECK1-NEXT: [[CONV:%.*]] = bitcast i64* [[ARGC_CASTED]] to i32*
+// CHECK1-NEXT: store i32 [[TMP10]], i32* [[CONV]], align 4
+// CHECK1-NEXT: [[TMP11:%.*]] = load i64, i64* [[ARGC_CASTED]], align 8
+// CHECK1-NEXT: [[TMP12:%.*]] = load i32*, i32** [[TMP]], align 8
+// CHECK1-NEXT: [[TMP13:%.*]] = load i32*, i32** [[_TMP2]], align 8
+// CHECK1-NEXT: [[TMP14:%.*]] = load i32*, i32** [[D]], align 8
+// CHECK1-NEXT: [[TMP15:%.*]] = load %class.anon*, %class.anon** [[_TMP3]], align 8
+// CHECK1-NEXT: [[TMP16:%.*]] = getelementptr inbounds [[CLASS_ANON]], %class.anon* [[TMP15]], i32 0, i32 0
+// CHECK1-NEXT: [[TMP17:%.*]] = getelementptr inbounds [[CLASS_ANON]], %class.anon* [[TMP15]], i32 0, i32 0
+// CHECK1-NEXT: [[TMP18:%.*]] = load i32*, i32** [[TMP17]], align 8
+// CHECK1-NEXT: [[TMP19:%.*]] = getelementptr inbounds [[CLASS_ANON]], %class.anon* [[TMP15]], i32 0, i32 1
+// CHECK1-NEXT: [[TMP20:%.*]] = getelementptr inbounds [[CLASS_ANON]], %class.anon* [[TMP15]], i32 0, i32 1
+// CHECK1-NEXT: [[TMP21:%.*]] = load i32*, i32** [[TMP20]], align 8
+// CHECK1-NEXT: [[TMP22:%.*]] = getelementptr inbounds [[CLASS_ANON]], %class.anon* [[TMP15]], i32 0, i32 2
+// CHECK1-NEXT: [[TMP23:%.*]] = getelementptr inbounds [[CLASS_ANON]], %class.anon* [[TMP15]], i32 0, i32 2
+// CHECK1-NEXT: [[TMP24:%.*]] = load i32*, i32** [[TMP23]], align 8
+// CHECK1-NEXT: [[TMP25:%.*]] = getelementptr inbounds [[CLASS_ANON]], %class.anon* [[TMP15]], i32 0, i32 3
+// CHECK1-NEXT: [[TMP26:%.*]] = getelementptr inbounds [[CLASS_ANON]], %class.anon* [[TMP15]], i32 0, i32 3
+// CHECK1-NEXT: [[TMP27:%.*]] = load i32**, i32*** [[TMP26]], align 8
+// CHECK1-NEXT: [[TMP28:%.*]] = getelementptr inbounds [[CLASS_ANON]], %class.anon* [[TMP15]], i32 0, i32 4
+// CHECK1-NEXT: [[TMP29:%.*]] = getelementptr inbounds [[CLASS_ANON]], %class.anon* [[TMP15]], i32 0, i32 4
+// CHECK1-NEXT: [[TMP30:%.*]] = load i32*, i32** [[TMP29]], align 8
+// CHECK1-NEXT: [[TMP31:%.*]] = getelementptr inbounds [11 x i8*], [11 x i8*]* [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
+// CHECK1-NEXT: [[TMP32:%.*]] = bitcast i8** [[TMP31]] to i64*
+// CHECK1-NEXT: store i64 [[TMP11]], i64* [[TMP32]], align 8
+// CHECK1-NEXT: [[TMP33:%.*]] = getelementptr inbounds [11 x i8*], [11 x i8*]* [[DOTOFFLOAD_PTRS]], i32 0, i32 0
+// CHECK1-NEXT: [[TMP34:%.*]] = bitcast i8** [[TMP33]] to i64*
+// CHECK1-NEXT: store i64 [[TMP11]], i64* [[TMP34]], align 8
+// CHECK1-NEXT: [[TMP35:%.*]] = getelementptr inbounds [11 x i8*], [11 x i8*]* [[DOTOFFLOAD_MAPPERS]], i64 0, i64 0
+// CHECK1-NEXT: store i8* null, i8** [[TMP35]], align 8
+// CHECK1-NEXT: [[TMP36:%.*]] = getelementptr inbounds [11 x i8*], [11 x i8*]* [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 1
+// CHECK1-NEXT: [[TMP37:%.*]] = bitcast i8** [[TMP36]] to i32**
+// CHECK1-NEXT: store i32* [[TMP12]], i32** [[TMP37]], align 8
+// CHECK1-NEXT: [[TMP38:%.*]] = getelementptr inbounds [11 x i8*], [11 x i8*]* [[DOTOFFLOAD_PTRS]], i32 0, i32 1
+// CHECK1-NEXT: [[TMP39:%.*]] = bitcast i8** [[TMP38]] to i32**
+// CHECK1-NEXT: store i32* [[TMP12]], i32** [[TMP39]], align 8
+// CHECK1-NEXT: [[TMP40:%.*]] = getelementptr inbounds [11 x i8*], [11 x i8*]* [[DOTOFFLOAD_MAPPERS]], i64 0, i64 1
+// CHECK1-NEXT: store i8* null, i8** [[TMP40]], align 8
+// CHECK1-NEXT: [[TMP41:%.*]] = getelementptr inbounds [11 x i8*], [11 x i8*]* [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 2
+// CHECK1-NEXT: [[TMP42:%.*]] = bitcast i8** [[TMP41]] to i32**
+// CHECK1-NEXT: store i32* [[TMP13]], i32** [[TMP42]], align 8
+// CHECK1-NEXT: [[TMP43:%.*]] = getelementptr inbounds [11 x i8*], [11 x i8*]* [[DOTOFFLOAD_PTRS]], i32 0, i32 2
+// CHECK1-NEXT: [[TMP44:%.*]] = bitcast i8** [[TMP43]] to i32**
+// CHECK1-NEXT: store i32* [[TMP13]], i32** [[TMP44]], align 8
+// CHECK1-NEXT: [[TMP45:%.*]] = getelementptr inbounds [11 x i8*], [11 x i8*]* [[DOTOFFLOAD_MAPPERS]], i64 0, i64 2
+// CHECK1-NEXT: store i8* null, i8** [[TMP45]], align 8
+// CHECK1-NEXT: [[TMP46:%.*]] = getelementptr inbounds [11 x i8*], [11 x i8*]* [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 3
+// CHECK1-NEXT: [[TMP47:%.*]] = bitcast i8** [[TMP46]] to i32**
+// CHECK1-NEXT: store i32* [[TMP14]], i32** [[TMP47]], align 8
+// CHECK1-NEXT: [[TMP48:%.*]] = getelementptr inbounds [11 x i8*], [11 x i8*]* [[DOTOFFLOAD_PTRS]], i32 0, i32 3
+// CHECK1-NEXT: [[TMP49:%.*]] = bitcast i8** [[TMP48]] to i32**
+// CHECK1-NEXT: store i32* [[TMP14]], i32** [[TMP49]], align 8
+// CHECK1-NEXT: [[TMP50:%.*]] = getelementptr inbounds [11 x i8*], [11 x i8*]* [[DOTOFFLOAD_MAPPERS]], i64 0, i64 3
+// CHECK1-NEXT: store i8* null, i8** [[TMP50]], align 8
+// CHECK1-NEXT: [[TMP51:%.*]] = getelementptr inbounds [11 x i8*], [11 x i8*]* [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 4
+// CHECK1-NEXT: [[TMP52:%.*]] = bitcast i8** [[TMP51]] to i32**
+// CHECK1-NEXT: store i32* [[A]], i32** [[TMP52]], align 8
+// CHECK1-NEXT: [[TMP53:%.*]] = getelementptr inbounds [11 x i8*], [11 x i8*]* [[DOTOFFLOAD_PTRS]], i32 0, i32 4
+// CHECK1-NEXT: [[TMP54:%.*]] = bitcast i8** [[TMP53]]
to i32** +// CHECK1-NEXT: store i32* [[A]], i32** [[TMP54]], align 8 +// CHECK1-NEXT: [[TMP55:%.*]] = getelementptr inbounds [11 x i8*], [11 x i8*]* [[DOTOFFLOAD_MAPPERS]], i64 0, i64 4 +// CHECK1-NEXT: store i8* null, i8** [[TMP55]], align 8 +// CHECK1-NEXT: [[TMP56:%.*]] = getelementptr inbounds [11 x i8*], [11 x i8*]* [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 5 +// CHECK1-NEXT: [[TMP57:%.*]] = bitcast i8** [[TMP56]] to %class.anon** +// CHECK1-NEXT: store %class.anon* [[TMP15]], %class.anon** [[TMP57]], align 8 +// CHECK1-NEXT: [[TMP58:%.*]] = getelementptr inbounds [11 x i8*], [11 x i8*]* [[DOTOFFLOAD_PTRS]], i32 0, i32 5 +// CHECK1-NEXT: [[TMP59:%.*]] = bitcast i8** [[TMP58]] to %class.anon** +// CHECK1-NEXT: store %class.anon* [[TMP15]], %class.anon** [[TMP59]], align 8 +// CHECK1-NEXT: [[TMP60:%.*]] = getelementptr inbounds [11 x i8*], [11 x i8*]* [[DOTOFFLOAD_MAPPERS]], i64 0, i64 5 +// CHECK1-NEXT: store i8* null, i8** [[TMP60]], align 8 +// CHECK1-NEXT: [[TMP61:%.*]] = getelementptr inbounds [11 x i8*], [11 x i8*]* [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 6 +// CHECK1-NEXT: [[TMP62:%.*]] = bitcast i8** [[TMP61]] to i32*** +// CHECK1-NEXT: store i32** [[TMP16]], i32*** [[TMP62]], align 8 +// CHECK1-NEXT: [[TMP63:%.*]] = getelementptr inbounds [11 x i8*], [11 x i8*]* [[DOTOFFLOAD_PTRS]], i32 0, i32 6 +// CHECK1-NEXT: [[TMP64:%.*]] = bitcast i8** [[TMP63]] to i32** +// CHECK1-NEXT: store i32* [[TMP18]], i32** [[TMP64]], align 8 +// CHECK1-NEXT: [[TMP65:%.*]] = getelementptr inbounds [11 x i8*], [11 x i8*]* [[DOTOFFLOAD_MAPPERS]], i64 0, i64 6 +// CHECK1-NEXT: store i8* null, i8** [[TMP65]], align 8 +// CHECK1-NEXT: [[TMP66:%.*]] = getelementptr inbounds [11 x i8*], [11 x i8*]* [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 7 +// CHECK1-NEXT: [[TMP67:%.*]] = bitcast i8** [[TMP66]] to i32*** +// CHECK1-NEXT: store i32** [[TMP19]], i32*** [[TMP67]], align 8 +// CHECK1-NEXT: [[TMP68:%.*]] = getelementptr inbounds [11 x i8*], [11 x i8*]* [[DOTOFFLOAD_PTRS]], i32 0, i32 7 +// CHECK1-NEXT: [[TMP69:%.*]] = bitcast i8** [[TMP68]] to i32** +// CHECK1-NEXT: store i32* [[TMP21]], i32** [[TMP69]], align 8 +// CHECK1-NEXT: [[TMP70:%.*]] = getelementptr inbounds [11 x i8*], [11 x i8*]* [[DOTOFFLOAD_MAPPERS]], i64 0, i64 7 +// CHECK1-NEXT: store i8* null, i8** [[TMP70]], align 8 +// CHECK1-NEXT: [[TMP71:%.*]] = getelementptr inbounds [11 x i8*], [11 x i8*]* [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 8 +// CHECK1-NEXT: [[TMP72:%.*]] = bitcast i8** [[TMP71]] to i32*** +// CHECK1-NEXT: store i32** [[TMP22]], i32*** [[TMP72]], align 8 +// CHECK1-NEXT: [[TMP73:%.*]] = getelementptr inbounds [11 x i8*], [11 x i8*]* [[DOTOFFLOAD_PTRS]], i32 0, i32 8 +// CHECK1-NEXT: [[TMP74:%.*]] = bitcast i8** [[TMP73]] to i32** +// CHECK1-NEXT: store i32* [[TMP24]], i32** [[TMP74]], align 8 +// CHECK1-NEXT: [[TMP75:%.*]] = getelementptr inbounds [11 x i8*], [11 x i8*]* [[DOTOFFLOAD_MAPPERS]], i64 0, i64 8 +// CHECK1-NEXT: store i8* null, i8** [[TMP75]], align 8 +// CHECK1-NEXT: [[TMP76:%.*]] = getelementptr inbounds [11 x i8*], [11 x i8*]* [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 9 +// CHECK1-NEXT: [[TMP77:%.*]] = bitcast i8** [[TMP76]] to i32**** +// CHECK1-NEXT: store i32*** [[TMP25]], i32**** [[TMP77]], align 8 +// CHECK1-NEXT: [[TMP78:%.*]] = getelementptr inbounds [11 x i8*], [11 x i8*]* [[DOTOFFLOAD_PTRS]], i32 0, i32 9 +// CHECK1-NEXT: [[TMP79:%.*]] = bitcast i8** [[TMP78]] to i32*** +// CHECK1-NEXT: store i32** [[TMP27]], i32*** [[TMP79]], align 8 +// CHECK1-NEXT: [[TMP80:%.*]] = getelementptr inbounds [11 x i8*], [11 x i8*]* 
[[DOTOFFLOAD_MAPPERS]], i64 0, i64 9 +// CHECK1-NEXT: store i8* null, i8** [[TMP80]], align 8 +// CHECK1-NEXT: [[TMP81:%.*]] = getelementptr inbounds [11 x i8*], [11 x i8*]* [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 10 +// CHECK1-NEXT: [[TMP82:%.*]] = bitcast i8** [[TMP81]] to i32*** +// CHECK1-NEXT: store i32** [[TMP28]], i32*** [[TMP82]], align 8 +// CHECK1-NEXT: [[TMP83:%.*]] = getelementptr inbounds [11 x i8*], [11 x i8*]* [[DOTOFFLOAD_PTRS]], i32 0, i32 10 +// CHECK1-NEXT: [[TMP84:%.*]] = bitcast i8** [[TMP83]] to i32** +// CHECK1-NEXT: store i32* [[TMP30]], i32** [[TMP84]], align 8 +// CHECK1-NEXT: [[TMP85:%.*]] = getelementptr inbounds [11 x i8*], [11 x i8*]* [[DOTOFFLOAD_MAPPERS]], i64 0, i64 10 +// CHECK1-NEXT: store i8* null, i8** [[TMP85]], align 8 +// CHECK1-NEXT: [[TMP86:%.*]] = getelementptr inbounds [11 x i8*], [11 x i8*]* [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 +// CHECK1-NEXT: [[TMP87:%.*]] = getelementptr inbounds [11 x i8*], [11 x i8*]* [[DOTOFFLOAD_PTRS]], i32 0, i32 0 +// CHECK1-NEXT: [[TMP88:%.*]] = call i32 @__tgt_target_mapper(%struct.ident_t* @[[GLOB1:[0-9]+]], i64 -1, i8* @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l41.region_id, i32 11, i8** [[TMP86]], i8** [[TMP87]], i64* getelementptr inbounds ([11 x i64], [11 x i64]* @.offload_sizes, i32 0, i32 0), i64* getelementptr inbounds ([11 x i64], [11 x i64]* @.offload_maptypes, i32 0, i32 0), i8** null, i8** null) +// CHECK1-NEXT: [[TMP89:%.*]] = icmp ne i32 [[TMP88]], 0 +// CHECK1-NEXT: br i1 [[TMP89]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] +// CHECK1: omp_offload.failed: +// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l41(i64 [[TMP11]], i32* [[TMP12]], i32* [[TMP13]], i32* [[TMP14]], i32* [[A]], %class.anon* [[TMP15]]) #[[ATTR4:[0-9]+]] +// CHECK1-NEXT: br label [[OMP_OFFLOAD_CONT]] +// CHECK1: omp_offload.cont: +// CHECK1-NEXT: [[TMP90:%.*]] = load i32*, i32** [[B]], align 8 +// CHECK1-NEXT: store i32* [[TMP90]], i32** [[_TMP4]], align 8 +// CHECK1-NEXT: [[TMP91:%.*]] = load i32*, i32** [[C]], align 8 +// CHECK1-NEXT: store i32* [[TMP91]], i32** [[_TMP5]], align 8 +// CHECK1-NEXT: [[TMP92:%.*]] = load %class.anon*, %class.anon** [[L]], align 8 +// CHECK1-NEXT: store %class.anon* [[TMP92]], %class.anon** [[_TMP6]], align 8 +// CHECK1-NEXT: [[TMP93:%.*]] = load i32*, i32** [[_TMP4]], align 8 +// CHECK1-NEXT: [[TMP94:%.*]] = load i32*, i32** [[_TMP5]], align 8 +// CHECK1-NEXT: [[TMP95:%.*]] = load i32*, i32** [[D]], align 8 +// CHECK1-NEXT: [[TMP96:%.*]] = load %class.anon*, %class.anon** [[_TMP6]], align 8 +// CHECK1-NEXT: [[TMP97:%.*]] = getelementptr inbounds [[CLASS_ANON]], %class.anon* [[TMP96]], i32 0, i32 0 +// CHECK1-NEXT: [[TMP98:%.*]] = getelementptr inbounds [[CLASS_ANON]], %class.anon* [[TMP96]], i32 0, i32 0 +// CHECK1-NEXT: [[TMP99:%.*]] = load i32*, i32** [[TMP98]], align 8 +// CHECK1-NEXT: [[TMP100:%.*]] = getelementptr inbounds [[CLASS_ANON]], %class.anon* [[TMP96]], i32 0, i32 1 +// CHECK1-NEXT: [[TMP101:%.*]] = getelementptr inbounds [[CLASS_ANON]], %class.anon* [[TMP96]], i32 0, i32 1 +// CHECK1-NEXT: [[TMP102:%.*]] = load i32*, i32** [[TMP101]], align 8 +// CHECK1-NEXT: [[TMP103:%.*]] = getelementptr inbounds [[CLASS_ANON]], %class.anon* [[TMP96]], i32 0, i32 2 +// CHECK1-NEXT: [[TMP104:%.*]] = getelementptr inbounds [[CLASS_ANON]], %class.anon* [[TMP96]], i32 0, i32 2 +// CHECK1-NEXT: [[TMP105:%.*]] = load i32*, i32** [[TMP104]], align 8 +// CHECK1-NEXT: [[TMP106:%.*]] = getelementptr inbounds [[CLASS_ANON]], %class.anon* [[TMP96]], i32 0, i32 3 
+// CHECK1-NEXT: [[TMP107:%.*]] = getelementptr inbounds [[CLASS_ANON]], %class.anon* [[TMP96]], i32 0, i32 3 +// CHECK1-NEXT: [[TMP108:%.*]] = load i32**, i32*** [[TMP107]], align 8 +// CHECK1-NEXT: [[TMP109:%.*]] = getelementptr inbounds [[CLASS_ANON]], %class.anon* [[TMP96]], i32 0, i32 4 +// CHECK1-NEXT: [[TMP110:%.*]] = getelementptr inbounds [[CLASS_ANON]], %class.anon* [[TMP96]], i32 0, i32 4 +// CHECK1-NEXT: [[TMP111:%.*]] = load i32*, i32** [[TMP110]], align 8 +// CHECK1-NEXT: [[TMP112:%.*]] = getelementptr inbounds [11 x i8*], [11 x i8*]* [[DOTOFFLOAD_BASEPTRS7]], i32 0, i32 0 +// CHECK1-NEXT: [[TMP113:%.*]] = bitcast i8** [[TMP112]] to i32** +// CHECK1-NEXT: store i32* [[ARGC_ADDR]], i32** [[TMP113]], align 8 +// CHECK1-NEXT: [[TMP114:%.*]] = getelementptr inbounds [11 x i8*], [11 x i8*]* [[DOTOFFLOAD_PTRS8]], i32 0, i32 0 +// CHECK1-NEXT: [[TMP115:%.*]] = bitcast i8** [[TMP114]] to i32** +// CHECK1-NEXT: store i32* [[ARGC_ADDR]], i32** [[TMP115]], align 8 +// CHECK1-NEXT: [[TMP116:%.*]] = getelementptr inbounds [11 x i8*], [11 x i8*]* [[DOTOFFLOAD_MAPPERS9]], i64 0, i64 0 +// CHECK1-NEXT: store i8* null, i8** [[TMP116]], align 8 +// CHECK1-NEXT: [[TMP117:%.*]] = getelementptr inbounds [11 x i8*], [11 x i8*]* [[DOTOFFLOAD_BASEPTRS7]], i32 0, i32 1 +// CHECK1-NEXT: [[TMP118:%.*]] = bitcast i8** [[TMP117]] to i32** +// CHECK1-NEXT: store i32* [[TMP93]], i32** [[TMP118]], align 8 +// CHECK1-NEXT: [[TMP119:%.*]] = getelementptr inbounds [11 x i8*], [11 x i8*]* [[DOTOFFLOAD_PTRS8]], i32 0, i32 1 +// CHECK1-NEXT: [[TMP120:%.*]] = bitcast i8** [[TMP119]] to i32** +// CHECK1-NEXT: store i32* [[TMP93]], i32** [[TMP120]], align 8 +// CHECK1-NEXT: [[TMP121:%.*]] = getelementptr inbounds [11 x i8*], [11 x i8*]* [[DOTOFFLOAD_MAPPERS9]], i64 0, i64 1 +// CHECK1-NEXT: store i8* null, i8** [[TMP121]], align 8 +// CHECK1-NEXT: [[TMP122:%.*]] = getelementptr inbounds [11 x i8*], [11 x i8*]* [[DOTOFFLOAD_BASEPTRS7]], i32 0, i32 2 +// CHECK1-NEXT: [[TMP123:%.*]] = bitcast i8** [[TMP122]] to i32** +// CHECK1-NEXT: store i32* [[TMP94]], i32** [[TMP123]], align 8 +// CHECK1-NEXT: [[TMP124:%.*]] = getelementptr inbounds [11 x i8*], [11 x i8*]* [[DOTOFFLOAD_PTRS8]], i32 0, i32 2 +// CHECK1-NEXT: [[TMP125:%.*]] = bitcast i8** [[TMP124]] to i32** +// CHECK1-NEXT: store i32* [[TMP94]], i32** [[TMP125]], align 8 +// CHECK1-NEXT: [[TMP126:%.*]] = getelementptr inbounds [11 x i8*], [11 x i8*]* [[DOTOFFLOAD_MAPPERS9]], i64 0, i64 2 +// CHECK1-NEXT: store i8* null, i8** [[TMP126]], align 8 +// CHECK1-NEXT: [[TMP127:%.*]] = getelementptr inbounds [11 x i8*], [11 x i8*]* [[DOTOFFLOAD_BASEPTRS7]], i32 0, i32 3 +// CHECK1-NEXT: [[TMP128:%.*]] = bitcast i8** [[TMP127]] to i32** +// CHECK1-NEXT: store i32* [[TMP95]], i32** [[TMP128]], align 8 +// CHECK1-NEXT: [[TMP129:%.*]] = getelementptr inbounds [11 x i8*], [11 x i8*]* [[DOTOFFLOAD_PTRS8]], i32 0, i32 3 +// CHECK1-NEXT: [[TMP130:%.*]] = bitcast i8** [[TMP129]] to i32** +// CHECK1-NEXT: store i32* [[TMP95]], i32** [[TMP130]], align 8 +// CHECK1-NEXT: [[TMP131:%.*]] = getelementptr inbounds [11 x i8*], [11 x i8*]* [[DOTOFFLOAD_MAPPERS9]], i64 0, i64 3 +// CHECK1-NEXT: store i8* null, i8** [[TMP131]], align 8 +// CHECK1-NEXT: [[TMP132:%.*]] = getelementptr inbounds [11 x i8*], [11 x i8*]* [[DOTOFFLOAD_BASEPTRS7]], i32 0, i32 4 +// CHECK1-NEXT: [[TMP133:%.*]] = bitcast i8** [[TMP132]] to i32** +// CHECK1-NEXT: store i32* [[A]], i32** [[TMP133]], align 8 +// CHECK1-NEXT: [[TMP134:%.*]] = getelementptr inbounds [11 x i8*], [11 x i8*]* [[DOTOFFLOAD_PTRS8]], i32 0, i32 4 
+// CHECK1-NEXT: [[TMP135:%.*]] = bitcast i8** [[TMP134]] to i32** +// CHECK1-NEXT: store i32* [[A]], i32** [[TMP135]], align 8 +// CHECK1-NEXT: [[TMP136:%.*]] = getelementptr inbounds [11 x i8*], [11 x i8*]* [[DOTOFFLOAD_MAPPERS9]], i64 0, i64 4 +// CHECK1-NEXT: store i8* null, i8** [[TMP136]], align 8 +// CHECK1-NEXT: [[TMP137:%.*]] = getelementptr inbounds [11 x i8*], [11 x i8*]* [[DOTOFFLOAD_BASEPTRS7]], i32 0, i32 5 +// CHECK1-NEXT: [[TMP138:%.*]] = bitcast i8** [[TMP137]] to %class.anon** +// CHECK1-NEXT: store %class.anon* [[TMP96]], %class.anon** [[TMP138]], align 8 +// CHECK1-NEXT: [[TMP139:%.*]] = getelementptr inbounds [11 x i8*], [11 x i8*]* [[DOTOFFLOAD_PTRS8]], i32 0, i32 5 +// CHECK1-NEXT: [[TMP140:%.*]] = bitcast i8** [[TMP139]] to %class.anon** +// CHECK1-NEXT: store %class.anon* [[TMP96]], %class.anon** [[TMP140]], align 8 +// CHECK1-NEXT: [[TMP141:%.*]] = getelementptr inbounds [11 x i8*], [11 x i8*]* [[DOTOFFLOAD_MAPPERS9]], i64 0, i64 5 +// CHECK1-NEXT: store i8* null, i8** [[TMP141]], align 8 +// CHECK1-NEXT: [[TMP142:%.*]] = getelementptr inbounds [11 x i8*], [11 x i8*]* [[DOTOFFLOAD_BASEPTRS7]], i32 0, i32 6 +// CHECK1-NEXT: [[TMP143:%.*]] = bitcast i8** [[TMP142]] to i32*** +// CHECK1-NEXT: store i32** [[TMP97]], i32*** [[TMP143]], align 8 +// CHECK1-NEXT: [[TMP144:%.*]] = getelementptr inbounds [11 x i8*], [11 x i8*]* [[DOTOFFLOAD_PTRS8]], i32 0, i32 6 +// CHECK1-NEXT: [[TMP145:%.*]] = bitcast i8** [[TMP144]] to i32** +// CHECK1-NEXT: store i32* [[TMP99]], i32** [[TMP145]], align 8 +// CHECK1-NEXT: [[TMP146:%.*]] = getelementptr inbounds [11 x i8*], [11 x i8*]* [[DOTOFFLOAD_MAPPERS9]], i64 0, i64 6 +// CHECK1-NEXT: store i8* null, i8** [[TMP146]], align 8 +// CHECK1-NEXT: [[TMP147:%.*]] = getelementptr inbounds [11 x i8*], [11 x i8*]* [[DOTOFFLOAD_BASEPTRS7]], i32 0, i32 7 +// CHECK1-NEXT: [[TMP148:%.*]] = bitcast i8** [[TMP147]] to i32*** +// CHECK1-NEXT: store i32** [[TMP100]], i32*** [[TMP148]], align 8 +// CHECK1-NEXT: [[TMP149:%.*]] = getelementptr inbounds [11 x i8*], [11 x i8*]* [[DOTOFFLOAD_PTRS8]], i32 0, i32 7 +// CHECK1-NEXT: [[TMP150:%.*]] = bitcast i8** [[TMP149]] to i32** +// CHECK1-NEXT: store i32* [[TMP102]], i32** [[TMP150]], align 8 +// CHECK1-NEXT: [[TMP151:%.*]] = getelementptr inbounds [11 x i8*], [11 x i8*]* [[DOTOFFLOAD_MAPPERS9]], i64 0, i64 7 +// CHECK1-NEXT: store i8* null, i8** [[TMP151]], align 8 +// CHECK1-NEXT: [[TMP152:%.*]] = getelementptr inbounds [11 x i8*], [11 x i8*]* [[DOTOFFLOAD_BASEPTRS7]], i32 0, i32 8 +// CHECK1-NEXT: [[TMP153:%.*]] = bitcast i8** [[TMP152]] to i32*** +// CHECK1-NEXT: store i32** [[TMP103]], i32*** [[TMP153]], align 8 +// CHECK1-NEXT: [[TMP154:%.*]] = getelementptr inbounds [11 x i8*], [11 x i8*]* [[DOTOFFLOAD_PTRS8]], i32 0, i32 8 +// CHECK1-NEXT: [[TMP155:%.*]] = bitcast i8** [[TMP154]] to i32** +// CHECK1-NEXT: store i32* [[TMP105]], i32** [[TMP155]], align 8 +// CHECK1-NEXT: [[TMP156:%.*]] = getelementptr inbounds [11 x i8*], [11 x i8*]* [[DOTOFFLOAD_MAPPERS9]], i64 0, i64 8 +// CHECK1-NEXT: store i8* null, i8** [[TMP156]], align 8 +// CHECK1-NEXT: [[TMP157:%.*]] = getelementptr inbounds [11 x i8*], [11 x i8*]* [[DOTOFFLOAD_BASEPTRS7]], i32 0, i32 9 +// CHECK1-NEXT: [[TMP158:%.*]] = bitcast i8** [[TMP157]] to i32**** +// CHECK1-NEXT: store i32*** [[TMP106]], i32**** [[TMP158]], align 8 +// CHECK1-NEXT: [[TMP159:%.*]] = getelementptr inbounds [11 x i8*], [11 x i8*]* [[DOTOFFLOAD_PTRS8]], i32 0, i32 9 +// CHECK1-NEXT: [[TMP160:%.*]] = bitcast i8** [[TMP159]] to i32*** +// CHECK1-NEXT: store i32** 
[[TMP108]], i32*** [[TMP160]], align 8 +// CHECK1-NEXT: [[TMP161:%.*]] = getelementptr inbounds [11 x i8*], [11 x i8*]* [[DOTOFFLOAD_MAPPERS9]], i64 0, i64 9 +// CHECK1-NEXT: store i8* null, i8** [[TMP161]], align 8 +// CHECK1-NEXT: [[TMP162:%.*]] = getelementptr inbounds [11 x i8*], [11 x i8*]* [[DOTOFFLOAD_BASEPTRS7]], i32 0, i32 10 +// CHECK1-NEXT: [[TMP163:%.*]] = bitcast i8** [[TMP162]] to i32*** +// CHECK1-NEXT: store i32** [[TMP109]], i32*** [[TMP163]], align 8 +// CHECK1-NEXT: [[TMP164:%.*]] = getelementptr inbounds [11 x i8*], [11 x i8*]* [[DOTOFFLOAD_PTRS8]], i32 0, i32 10 +// CHECK1-NEXT: [[TMP165:%.*]] = bitcast i8** [[TMP164]] to i32** +// CHECK1-NEXT: store i32* [[TMP111]], i32** [[TMP165]], align 8 +// CHECK1-NEXT: [[TMP166:%.*]] = getelementptr inbounds [11 x i8*], [11 x i8*]* [[DOTOFFLOAD_MAPPERS9]], i64 0, i64 10 +// CHECK1-NEXT: store i8* null, i8** [[TMP166]], align 8 +// CHECK1-NEXT: [[TMP167:%.*]] = getelementptr inbounds [11 x i8*], [11 x i8*]* [[DOTOFFLOAD_BASEPTRS7]], i32 0, i32 0 +// CHECK1-NEXT: [[TMP168:%.*]] = getelementptr inbounds [11 x i8*], [11 x i8*]* [[DOTOFFLOAD_PTRS8]], i32 0, i32 0 +// CHECK1-NEXT: [[TMP169:%.*]] = call i32 @__tgt_target_teams_mapper(%struct.ident_t* @[[GLOB1]], i64 -1, i8* @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l43.region_id, i32 11, i8** [[TMP167]], i8** [[TMP168]], i64* getelementptr inbounds ([11 x i64], [11 x i64]* @.offload_sizes.1, i32 0, i32 0), i64* getelementptr inbounds ([11 x i64], [11 x i64]* @.offload_maptypes.2, i32 0, i32 0), i8** null, i8** null, i32 1, i32 0) +// CHECK1-NEXT: [[TMP170:%.*]] = icmp ne i32 [[TMP169]], 0 +// CHECK1-NEXT: br i1 [[TMP170]], label [[OMP_OFFLOAD_FAILED10:%.*]], label [[OMP_OFFLOAD_CONT11:%.*]] +// CHECK1: omp_offload.failed10: +// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l43(i32* [[ARGC_ADDR]], i32* [[TMP93]], i32* [[TMP94]], i32* [[TMP95]], i32* [[A]], %class.anon* [[TMP96]]) #[[ATTR4]] +// CHECK1-NEXT: br label [[OMP_OFFLOAD_CONT11]] +// CHECK1: omp_offload.cont11: +// CHECK1-NEXT: [[TMP171:%.*]] = load i32, i32* [[ARGC_ADDR]], align 4 +// CHECK1-NEXT: [[CALL:%.*]] = call signext i32 @_ZN1S3fooEv(%struct.S* nonnull dereferenceable(4) @s) +// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP171]], [[CALL]] +// CHECK1-NEXT: ret i32 [[ADD]] +// +// +// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l41 +// CHECK1-SAME: (i64 [[ARGC:%.*]], i32* nonnull align 4 dereferenceable(4) [[B:%.*]], i32* nonnull align 4 dereferenceable(4) [[C:%.*]], i32* [[D:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], %class.anon* nonnull align 8 dereferenceable(40) [[L:%.*]]) #[[ATTR1:[0-9]+]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[ARGC_ADDR:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[B_ADDR:%.*]] = alloca i32*, align 8 +// CHECK1-NEXT: [[C_ADDR:%.*]] = alloca i32*, align 8 +// CHECK1-NEXT: [[D_ADDR:%.*]] = alloca i32*, align 8 +// CHECK1-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 8 +// CHECK1-NEXT: [[L_ADDR:%.*]] = alloca %class.anon*, align 8 +// CHECK1-NEXT: [[TMP:%.*]] = alloca i32*, align 8 +// CHECK1-NEXT: [[_TMP1:%.*]] = alloca i32*, align 8 +// CHECK1-NEXT: [[_TMP2:%.*]] = alloca %class.anon*, align 8 +// CHECK1-NEXT: [[L3:%.*]] = alloca [[CLASS_ANON:%.*]], align 8 +// CHECK1-NEXT: [[_TMP4:%.*]] = alloca %class.anon*, align 8 +// CHECK1-NEXT: [[B5:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[_TMP6:%.*]] = alloca i32*, align 8 +// CHECK1-NEXT: [[C7:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[_TMP8:%.*]] = alloca i32*, align 
8 +// CHECK1-NEXT: store i64 [[ARGC]], i64* [[ARGC_ADDR]], align 8 +// CHECK1-NEXT: store i32* [[B]], i32** [[B_ADDR]], align 8 +// CHECK1-NEXT: store i32* [[C]], i32** [[C_ADDR]], align 8 +// CHECK1-NEXT: store i32* [[D]], i32** [[D_ADDR]], align 8 +// CHECK1-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 8 +// CHECK1-NEXT: store %class.anon* [[L]], %class.anon** [[L_ADDR]], align 8 +// CHECK1-NEXT: [[CONV:%.*]] = bitcast i64* [[ARGC_ADDR]] to i32* +// CHECK1-NEXT: [[TMP0:%.*]] = load i32*, i32** [[B_ADDR]], align 8 +// CHECK1-NEXT: [[TMP1:%.*]] = load i32*, i32** [[C_ADDR]], align 8 +// CHECK1-NEXT: [[TMP2:%.*]] = load i32*, i32** [[A_ADDR]], align 8 +// CHECK1-NEXT: [[TMP3:%.*]] = load %class.anon*, %class.anon** [[L_ADDR]], align 8 +// CHECK1-NEXT: store i32* [[TMP0]], i32** [[TMP]], align 8 +// CHECK1-NEXT: store i32* [[TMP1]], i32** [[_TMP1]], align 8 +// CHECK1-NEXT: store %class.anon* [[TMP3]], %class.anon** [[_TMP2]], align 8 +// CHECK1-NEXT: [[TMP4:%.*]] = load %class.anon*, %class.anon** [[_TMP2]], align 8 +// CHECK1-NEXT: [[TMP5:%.*]] = bitcast %class.anon* [[L3]] to i8* +// CHECK1-NEXT: [[TMP6:%.*]] = bitcast %class.anon* [[TMP4]] to i8* +// CHECK1-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP5]], i8* align 8 [[TMP6]], i64 40, i1 false) +// CHECK1-NEXT: store %class.anon* [[L3]], %class.anon** [[_TMP4]], align 8 +// CHECK1-NEXT: [[TMP7:%.*]] = load i32*, i32** [[TMP]], align 8 +// CHECK1-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4 +// CHECK1-NEXT: store i32 [[TMP8]], i32* [[B5]], align 4 +// CHECK1-NEXT: store i32* [[B5]], i32** [[_TMP6]], align 8 +// CHECK1-NEXT: [[TMP9:%.*]] = load i32*, i32** [[_TMP1]], align 8 +// CHECK1-NEXT: [[TMP10:%.*]] = load i32, i32* [[TMP9]], align 4 +// CHECK1-NEXT: store i32 [[TMP10]], i32* [[C7]], align 4 +// CHECK1-NEXT: store i32* [[C7]], i32** [[_TMP8]], align 8 +// CHECK1-NEXT: [[TMP11:%.*]] = load %class.anon*, %class.anon** [[_TMP4]], align 8 +// CHECK1-NEXT: [[CALL:%.*]] = call i64 @"_ZZ4mainENK3$_0clEv"(%class.anon* nonnull dereferenceable(40) [[TMP11]]) +// CHECK1-NEXT: ret void +// +// +// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l43 +// CHECK1-SAME: (i32* nonnull align 4 dereferenceable(4) [[ARGC:%.*]], i32* nonnull align 4 dereferenceable(4) [[B:%.*]], i32* nonnull align 4 dereferenceable(4) [[C:%.*]], i32* [[D:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], %class.anon* nonnull align 8 dereferenceable(40) [[L:%.*]]) #[[ATTR1]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[ARGC_ADDR:%.*]] = alloca i32*, align 8 +// CHECK1-NEXT: [[B_ADDR:%.*]] = alloca i32*, align 8 +// CHECK1-NEXT: [[C_ADDR:%.*]] = alloca i32*, align 8 +// CHECK1-NEXT: [[D_ADDR:%.*]] = alloca i32*, align 8 +// CHECK1-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 8 +// CHECK1-NEXT: [[L_ADDR:%.*]] = alloca %class.anon*, align 8 +// CHECK1-NEXT: [[TMP:%.*]] = alloca i32*, align 8 +// CHECK1-NEXT: [[_TMP1:%.*]] = alloca i32*, align 8 +// CHECK1-NEXT: [[_TMP2:%.*]] = alloca %class.anon*, align 8 +// CHECK1-NEXT: store i32* [[ARGC]], i32** [[ARGC_ADDR]], align 8 +// CHECK1-NEXT: store i32* [[B]], i32** [[B_ADDR]], align 8 +// CHECK1-NEXT: store i32* [[C]], i32** [[C_ADDR]], align 8 +// CHECK1-NEXT: store i32* [[D]], i32** [[D_ADDR]], align 8 +// CHECK1-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 8 +// CHECK1-NEXT: store %class.anon* [[L]], %class.anon** [[L_ADDR]], align 8 +// CHECK1-NEXT: [[TMP0:%.*]] = load i32*, i32** [[ARGC_ADDR]], align 8 +// CHECK1-NEXT: [[TMP1:%.*]] = load i32*, i32** [[B_ADDR]], 
align 8 +// CHECK1-NEXT: [[TMP2:%.*]] = load i32*, i32** [[C_ADDR]], align 8 +// CHECK1-NEXT: [[TMP3:%.*]] = load i32*, i32** [[A_ADDR]], align 8 +// CHECK1-NEXT: [[TMP4:%.*]] = load %class.anon*, %class.anon** [[L_ADDR]], align 8 +// CHECK1-NEXT: store i32* [[TMP1]], i32** [[TMP]], align 8 +// CHECK1-NEXT: store i32* [[TMP2]], i32** [[_TMP1]], align 8 +// CHECK1-NEXT: store %class.anon* [[TMP4]], %class.anon** [[_TMP2]], align 8 +// CHECK1-NEXT: [[TMP5:%.*]] = load i32*, i32** [[TMP]], align 8 +// CHECK1-NEXT: [[TMP6:%.*]] = load i32*, i32** [[_TMP1]], align 8 +// CHECK1-NEXT: [[TMP7:%.*]] = load i32*, i32** [[D_ADDR]], align 8 +// CHECK1-NEXT: [[TMP8:%.*]] = load %class.anon*, %class.anon** [[_TMP2]], align 8 +// CHECK1-NEXT: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* @[[GLOB1]], i32 6, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*, i32*, i32*, i32*, i32*, %class.anon*)* @.omp_outlined. to void (i32*, i32*, ...)*), i32* [[TMP0]], i32* [[TMP5]], i32* [[TMP6]], i32* [[TMP7]], i32* [[TMP3]], %class.anon* [[TMP8]]) +// CHECK1-NEXT: ret void +// +// +// CHECK1-LABEL: define {{[^@]+}}@.omp_outlined. +// CHECK1-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[ARGC:%.*]], i32* nonnull align 4 dereferenceable(4) [[B:%.*]], i32* nonnull align 4 dereferenceable(4) [[C:%.*]], i32* [[D:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], %class.anon* nonnull align 8 dereferenceable(40) [[L:%.*]]) #[[ATTR1]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK1-NEXT: [[ARGC_ADDR:%.*]] = alloca i32*, align 8 +// CHECK1-NEXT: [[B_ADDR:%.*]] = alloca i32*, align 8 +// CHECK1-NEXT: [[C_ADDR:%.*]] = alloca i32*, align 8 +// CHECK1-NEXT: [[D_ADDR:%.*]] = alloca i32*, align 8 +// CHECK1-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 8 +// CHECK1-NEXT: [[L_ADDR:%.*]] = alloca %class.anon*, align 8 +// CHECK1-NEXT: [[TMP:%.*]] = alloca i32*, align 8 +// CHECK1-NEXT: [[_TMP1:%.*]] = alloca i32*, align 8 +// CHECK1-NEXT: [[_TMP2:%.*]] = alloca %class.anon*, align 8 +// CHECK1-NEXT: [[L3:%.*]] = alloca [[CLASS_ANON:%.*]], align 8 +// CHECK1-NEXT: [[_TMP4:%.*]] = alloca %class.anon*, align 8 +// CHECK1-NEXT: [[ARGC5:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[B6:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[_TMP7:%.*]] = alloca i32*, align 8 +// CHECK1-NEXT: [[C8:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[_TMP9:%.*]] = alloca i32*, align 8 +// CHECK1-NEXT: [[A10:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK1-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +// CHECK1-NEXT: store i32* [[ARGC]], i32** [[ARGC_ADDR]], align 8 +// CHECK1-NEXT: store i32* [[B]], i32** [[B_ADDR]], align 8 +// CHECK1-NEXT: store i32* [[C]], i32** [[C_ADDR]], align 8 +// CHECK1-NEXT: store i32* [[D]], i32** [[D_ADDR]], align 8 +// CHECK1-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 8 +// CHECK1-NEXT: store %class.anon* [[L]], %class.anon** [[L_ADDR]], align 8 +// CHECK1-NEXT: [[TMP0:%.*]] = load i32*, i32** [[ARGC_ADDR]], align 8 +// CHECK1-NEXT: [[TMP1:%.*]] = load i32*, i32** [[B_ADDR]], align 8 +// CHECK1-NEXT: [[TMP2:%.*]] = load i32*, i32** [[C_ADDR]], align 8 +// CHECK1-NEXT: [[TMP3:%.*]] = load i32*, i32** [[A_ADDR]], align 8 +// CHECK1-NEXT: [[TMP4:%.*]] = load 
%class.anon*, %class.anon** [[L_ADDR]], align 8 +// CHECK1-NEXT: store i32* [[TMP1]], i32** [[TMP]], align 8 +// CHECK1-NEXT: store i32* [[TMP2]], i32** [[_TMP1]], align 8 +// CHECK1-NEXT: store %class.anon* [[TMP4]], %class.anon** [[_TMP2]], align 8 +// CHECK1-NEXT: [[TMP5:%.*]] = load %class.anon*, %class.anon** [[_TMP2]], align 8 +// CHECK1-NEXT: [[TMP6:%.*]] = bitcast %class.anon* [[L3]] to i8* +// CHECK1-NEXT: [[TMP7:%.*]] = bitcast %class.anon* [[TMP5]] to i8* +// CHECK1-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP6]], i8* align 8 [[TMP7]], i64 40, i1 false) +// CHECK1-NEXT: store %class.anon* [[L3]], %class.anon** [[_TMP4]], align 8 +// CHECK1-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP0]], align 4 +// CHECK1-NEXT: store i32 [[TMP8]], i32* [[ARGC5]], align 4 +// CHECK1-NEXT: [[TMP9:%.*]] = load i32*, i32** [[TMP]], align 8 +// CHECK1-NEXT: [[TMP10:%.*]] = load i32, i32* [[TMP9]], align 4 +// CHECK1-NEXT: store i32 [[TMP10]], i32* [[B6]], align 4 +// CHECK1-NEXT: store i32* [[B6]], i32** [[_TMP7]], align 8 +// CHECK1-NEXT: [[TMP11:%.*]] = load i32*, i32** [[_TMP1]], align 8 +// CHECK1-NEXT: [[TMP12:%.*]] = load i32, i32* [[TMP11]], align 4 +// CHECK1-NEXT: store i32 [[TMP12]], i32* [[C8]], align 4 +// CHECK1-NEXT: store i32* [[C8]], i32** [[_TMP9]], align 8 +// CHECK1-NEXT: [[TMP13:%.*]] = load i32, i32* [[TMP3]], align 4 +// CHECK1-NEXT: store i32 [[TMP13]], i32* [[A10]], align 4 +// CHECK1-NEXT: [[TMP14:%.*]] = load %class.anon*, %class.anon** [[_TMP4]], align 8 +// CHECK1-NEXT: [[CALL:%.*]] = call i64 @"_ZZ4mainENK3$_0clEv"(%class.anon* nonnull dereferenceable(40) [[TMP14]]) +// CHECK1-NEXT: ret void +// +// +// CHECK1-LABEL: define {{[^@]+}}@_ZN1S3fooEv +// CHECK1-SAME: (%struct.S* nonnull dereferenceable(4) [[THIS:%.*]]) #[[ATTR3:[0-9]+]] comdat align 2 { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[THIS_ADDR:%.*]] = alloca %struct.S*, align 8 +// CHECK1-NEXT: [[L:%.*]] = alloca %class.anon.0*, align 8 +// CHECK1-NEXT: [[REF_TMP:%.*]] = alloca [[CLASS_ANON_0:%.*]], align 8 +// CHECK1-NEXT: [[TMP:%.*]] = alloca %class.anon.0*, align 8 +// CHECK1-NEXT: [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [3 x i8*], align 8 +// CHECK1-NEXT: [[DOTOFFLOAD_PTRS:%.*]] = alloca [3 x i8*], align 8 +// CHECK1-NEXT: [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [3 x i8*], align 8 +// CHECK1-NEXT: [[_TMP2:%.*]] = alloca %class.anon.0*, align 8 +// CHECK1-NEXT: [[DOTOFFLOAD_BASEPTRS3:%.*]] = alloca [3 x i8*], align 8 +// CHECK1-NEXT: [[DOTOFFLOAD_PTRS4:%.*]] = alloca [3 x i8*], align 8 +// CHECK1-NEXT: [[DOTOFFLOAD_MAPPERS5:%.*]] = alloca [3 x i8*], align 8 +// CHECK1-NEXT: store %struct.S* [[THIS]], %struct.S** [[THIS_ADDR]], align 8 +// CHECK1-NEXT: [[THIS1:%.*]] = load %struct.S*, %struct.S** [[THIS_ADDR]], align 8 +// CHECK1-NEXT: [[TMP0:%.*]] = getelementptr inbounds [[CLASS_ANON_0]], %class.anon.0* [[REF_TMP]], i32 0, i32 0 +// CHECK1-NEXT: store %struct.S* [[THIS1]], %struct.S** [[TMP0]], align 8 +// CHECK1-NEXT: store %class.anon.0* [[REF_TMP]], %class.anon.0** [[L]], align 8 +// CHECK1-NEXT: [[TMP1:%.*]] = load %class.anon.0*, %class.anon.0** [[L]], align 8 +// CHECK1-NEXT: store %class.anon.0* [[TMP1]], %class.anon.0** [[TMP]], align 8 +// CHECK1-NEXT: [[TMP2:%.*]] = load %class.anon.0*, %class.anon.0** [[TMP]], align 8 +// CHECK1-NEXT: [[TMP3:%.*]] = getelementptr inbounds [[CLASS_ANON_0]], %class.anon.0* [[TMP2]], i32 0, i32 0 +// CHECK1-NEXT: [[TMP4:%.*]] = getelementptr inbounds [[CLASS_ANON_0]], %class.anon.0* [[TMP2]], i32 0, i32 0 +// CHECK1-NEXT: [[TMP5:%.*]] = getelementptr inbounds [3 
x i8*], [3 x i8*]* [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 +// CHECK1-NEXT: [[TMP6:%.*]] = bitcast i8** [[TMP5]] to %struct.S** +// CHECK1-NEXT: store %struct.S* [[THIS1]], %struct.S** [[TMP6]], align 8 +// CHECK1-NEXT: [[TMP7:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[DOTOFFLOAD_PTRS]], i32 0, i32 0 +// CHECK1-NEXT: [[TMP8:%.*]] = bitcast i8** [[TMP7]] to %struct.S** +// CHECK1-NEXT: store %struct.S* [[THIS1]], %struct.S** [[TMP8]], align 8 +// CHECK1-NEXT: [[TMP9:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[DOTOFFLOAD_MAPPERS]], i64 0, i64 0 +// CHECK1-NEXT: store i8* null, i8** [[TMP9]], align 8 +// CHECK1-NEXT: [[TMP10:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 1 +// CHECK1-NEXT: [[TMP11:%.*]] = bitcast i8** [[TMP10]] to %class.anon.0** +// CHECK1-NEXT: store %class.anon.0* [[TMP2]], %class.anon.0** [[TMP11]], align 8 +// CHECK1-NEXT: [[TMP12:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[DOTOFFLOAD_PTRS]], i32 0, i32 1 +// CHECK1-NEXT: [[TMP13:%.*]] = bitcast i8** [[TMP12]] to %class.anon.0** +// CHECK1-NEXT: store %class.anon.0* [[TMP2]], %class.anon.0** [[TMP13]], align 8 +// CHECK1-NEXT: [[TMP14:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[DOTOFFLOAD_MAPPERS]], i64 0, i64 1 +// CHECK1-NEXT: store i8* null, i8** [[TMP14]], align 8 +// CHECK1-NEXT: [[TMP15:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 2 +// CHECK1-NEXT: [[TMP16:%.*]] = bitcast i8** [[TMP15]] to %struct.S*** +// CHECK1-NEXT: store %struct.S** [[TMP3]], %struct.S*** [[TMP16]], align 8 +// CHECK1-NEXT: [[TMP17:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[DOTOFFLOAD_PTRS]], i32 0, i32 2 +// CHECK1-NEXT: [[TMP18:%.*]] = bitcast i8** [[TMP17]] to %struct.S*** +// CHECK1-NEXT: store %struct.S** [[TMP4]], %struct.S*** [[TMP18]], align 8 +// CHECK1-NEXT: [[TMP19:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[DOTOFFLOAD_MAPPERS]], i64 0, i64 2 +// CHECK1-NEXT: store i8* null, i8** [[TMP19]], align 8 +// CHECK1-NEXT: [[TMP20:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 +// CHECK1-NEXT: [[TMP21:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[DOTOFFLOAD_PTRS]], i32 0, i32 0 +// CHECK1-NEXT: [[TMP22:%.*]] = call i32 @__tgt_target_mapper(%struct.ident_t* @[[GLOB1]], i64 -1, i8* @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN1S3fooEv_l27.region_id, i32 3, i8** [[TMP20]], i8** [[TMP21]], i64* getelementptr inbounds ([3 x i64], [3 x i64]* @.offload_sizes.3, i32 0, i32 0), i64* getelementptr inbounds ([3 x i64], [3 x i64]* @.offload_maptypes.4, i32 0, i32 0), i8** null, i8** null) +// CHECK1-NEXT: [[TMP23:%.*]] = icmp ne i32 [[TMP22]], 0 +// CHECK1-NEXT: br i1 [[TMP23]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] +// CHECK1: omp_offload.failed: +// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN1S3fooEv_l27(%struct.S* [[THIS1]], %class.anon.0* [[TMP2]]) #[[ATTR4]] +// CHECK1-NEXT: br label [[OMP_OFFLOAD_CONT]] +// CHECK1: omp_offload.cont: +// CHECK1-NEXT: [[TMP24:%.*]] = load %class.anon.0*, %class.anon.0** [[L]], align 8 +// CHECK1-NEXT: store %class.anon.0* [[TMP24]], %class.anon.0** [[_TMP2]], align 8 +// CHECK1-NEXT: [[TMP25:%.*]] = load %class.anon.0*, %class.anon.0** [[_TMP2]], align 8 +// CHECK1-NEXT: [[TMP26:%.*]] = getelementptr inbounds [[CLASS_ANON_0]], %class.anon.0* [[TMP25]], i32 0, i32 0 +// CHECK1-NEXT: [[TMP27:%.*]] = getelementptr inbounds [[CLASS_ANON_0]], %class.anon.0* 
[[TMP25]], i32 0, i32 0 +// CHECK1-NEXT: [[TMP28:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[DOTOFFLOAD_BASEPTRS3]], i32 0, i32 0 +// CHECK1-NEXT: [[TMP29:%.*]] = bitcast i8** [[TMP28]] to %struct.S** +// CHECK1-NEXT: store %struct.S* [[THIS1]], %struct.S** [[TMP29]], align 8 +// CHECK1-NEXT: [[TMP30:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[DOTOFFLOAD_PTRS4]], i32 0, i32 0 +// CHECK1-NEXT: [[TMP31:%.*]] = bitcast i8** [[TMP30]] to %struct.S** +// CHECK1-NEXT: store %struct.S* [[THIS1]], %struct.S** [[TMP31]], align 8 +// CHECK1-NEXT: [[TMP32:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[DOTOFFLOAD_MAPPERS5]], i64 0, i64 0 +// CHECK1-NEXT: store i8* null, i8** [[TMP32]], align 8 +// CHECK1-NEXT: [[TMP33:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[DOTOFFLOAD_BASEPTRS3]], i32 0, i32 1 +// CHECK1-NEXT: [[TMP34:%.*]] = bitcast i8** [[TMP33]] to %class.anon.0** +// CHECK1-NEXT: store %class.anon.0* [[TMP25]], %class.anon.0** [[TMP34]], align 8 +// CHECK1-NEXT: [[TMP35:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[DOTOFFLOAD_PTRS4]], i32 0, i32 1 +// CHECK1-NEXT: [[TMP36:%.*]] = bitcast i8** [[TMP35]] to %class.anon.0** +// CHECK1-NEXT: store %class.anon.0* [[TMP25]], %class.anon.0** [[TMP36]], align 8 +// CHECK1-NEXT: [[TMP37:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[DOTOFFLOAD_MAPPERS5]], i64 0, i64 1 +// CHECK1-NEXT: store i8* null, i8** [[TMP37]], align 8 +// CHECK1-NEXT: [[TMP38:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[DOTOFFLOAD_BASEPTRS3]], i32 0, i32 2 +// CHECK1-NEXT: [[TMP39:%.*]] = bitcast i8** [[TMP38]] to %struct.S*** +// CHECK1-NEXT: store %struct.S** [[TMP26]], %struct.S*** [[TMP39]], align 8 +// CHECK1-NEXT: [[TMP40:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[DOTOFFLOAD_PTRS4]], i32 0, i32 2 +// CHECK1-NEXT: [[TMP41:%.*]] = bitcast i8** [[TMP40]] to %struct.S*** +// CHECK1-NEXT: store %struct.S** [[TMP27]], %struct.S*** [[TMP41]], align 8 +// CHECK1-NEXT: [[TMP42:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[DOTOFFLOAD_MAPPERS5]], i64 0, i64 2 +// CHECK1-NEXT: store i8* null, i8** [[TMP42]], align 8 +// CHECK1-NEXT: [[TMP43:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[DOTOFFLOAD_BASEPTRS3]], i32 0, i32 0 +// CHECK1-NEXT: [[TMP44:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[DOTOFFLOAD_PTRS4]], i32 0, i32 0 +// CHECK1-NEXT: [[TMP45:%.*]] = call i32 @__tgt_target_teams_mapper(%struct.ident_t* @[[GLOB1]], i64 -1, i8* @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN1S3fooEv_l29.region_id, i32 3, i8** [[TMP43]], i8** [[TMP44]], i64* getelementptr inbounds ([3 x i64], [3 x i64]* @.offload_sizes.6, i32 0, i32 0), i64* getelementptr inbounds ([3 x i64], [3 x i64]* @.offload_maptypes.7, i32 0, i32 0), i8** null, i8** null, i32 1, i32 0) +// CHECK1-NEXT: [[TMP46:%.*]] = icmp ne i32 [[TMP45]], 0 +// CHECK1-NEXT: br i1 [[TMP46]], label [[OMP_OFFLOAD_FAILED6:%.*]], label [[OMP_OFFLOAD_CONT7:%.*]] +// CHECK1: omp_offload.failed6: +// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN1S3fooEv_l29(%struct.S* [[THIS1]], %class.anon.0* [[TMP25]]) #[[ATTR4]] +// CHECK1-NEXT: br label [[OMP_OFFLOAD_CONT7]] +// CHECK1: omp_offload.cont7: +// CHECK1-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT_S:%.*]], %struct.S* [[THIS1]], i32 0, i32 0 +// CHECK1-NEXT: [[TMP47:%.*]] = load i32, i32* [[A]], align 4 +// CHECK1-NEXT: [[TMP48:%.*]] = load %class.anon.0*, %class.anon.0** [[L]], align 8 +// CHECK1-NEXT: [[CALL:%.*]] = call signext i32 
@_Z3fooIZN1S3fooEvEUlvE_EiRKT_(%class.anon.0* nonnull align 8 dereferenceable(8) [[TMP48]]) +// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP47]], [[CALL]] +// CHECK1-NEXT: ret i32 [[ADD]] +// +// +// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN1S3fooEv_l27 +// CHECK1-SAME: (%struct.S* [[THIS:%.*]], %class.anon.0* nonnull align 8 dereferenceable(8) [[L:%.*]]) #[[ATTR1]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[THIS_ADDR:%.*]] = alloca %struct.S*, align 8 +// CHECK1-NEXT: [[L_ADDR:%.*]] = alloca %class.anon.0*, align 8 +// CHECK1-NEXT: [[TMP:%.*]] = alloca %class.anon.0*, align 8 +// CHECK1-NEXT: [[L1:%.*]] = alloca [[CLASS_ANON_0:%.*]], align 8 +// CHECK1-NEXT: [[_TMP2:%.*]] = alloca %class.anon.0*, align 8 +// CHECK1-NEXT: store %struct.S* [[THIS]], %struct.S** [[THIS_ADDR]], align 8 +// CHECK1-NEXT: store %class.anon.0* [[L]], %class.anon.0** [[L_ADDR]], align 8 +// CHECK1-NEXT: [[TMP0:%.*]] = load %struct.S*, %struct.S** [[THIS_ADDR]], align 8 +// CHECK1-NEXT: [[TMP1:%.*]] = load %class.anon.0*, %class.anon.0** [[L_ADDR]], align 8 +// CHECK1-NEXT: store %class.anon.0* [[TMP1]], %class.anon.0** [[TMP]], align 8 +// CHECK1-NEXT: [[TMP2:%.*]] = load %class.anon.0*, %class.anon.0** [[TMP]], align 8 +// CHECK1-NEXT: [[TMP3:%.*]] = bitcast %class.anon.0* [[L1]] to i8* +// CHECK1-NEXT: [[TMP4:%.*]] = bitcast %class.anon.0* [[TMP2]] to i8* +// CHECK1-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP3]], i8* align 8 [[TMP4]], i64 8, i1 false) +// CHECK1-NEXT: store %class.anon.0* [[L1]], %class.anon.0** [[_TMP2]], align 8 +// CHECK1-NEXT: [[TMP5:%.*]] = load %class.anon.0*, %class.anon.0** [[_TMP2]], align 8 +// CHECK1-NEXT: [[CALL:%.*]] = call signext i32 @_ZZN1S3fooEvENKUlvE_clEv(%class.anon.0* nonnull dereferenceable(8) [[TMP5]]) +// CHECK1-NEXT: ret void +// +// +// CHECK1-LABEL: define {{[^@]+}}@_ZZN1S3fooEvENKUlvE_clEv +// CHECK1-SAME: (%class.anon.0* nonnull dereferenceable(8) [[THIS:%.*]]) #[[ATTR3]] comdat align 2 { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[THIS_ADDR:%.*]] = alloca %class.anon.0*, align 8 +// CHECK1-NEXT: store %class.anon.0* [[THIS]], %class.anon.0** [[THIS_ADDR]], align 8 +// CHECK1-NEXT: [[THIS1:%.*]] = load %class.anon.0*, %class.anon.0** [[THIS_ADDR]], align 8 +// CHECK1-NEXT: [[TMP0:%.*]] = getelementptr inbounds [[CLASS_ANON_0:%.*]], %class.anon.0* [[THIS1]], i32 0, i32 0 +// CHECK1-NEXT: [[TMP1:%.*]] = load %struct.S*, %struct.S** [[TMP0]], align 8 +// CHECK1-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT_S:%.*]], %struct.S* [[TMP1]], i32 0, i32 0 +// CHECK1-NEXT: [[TMP2:%.*]] = load i32, i32* [[A]], align 4 +// CHECK1-NEXT: ret i32 [[TMP2]] +// +// +// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN1S3fooEv_l29 +// CHECK1-SAME: (%struct.S* [[THIS:%.*]], %class.anon.0* nonnull align 8 dereferenceable(8) [[L:%.*]]) #[[ATTR1]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[THIS_ADDR:%.*]] = alloca %struct.S*, align 8 +// CHECK1-NEXT: [[L_ADDR:%.*]] = alloca %class.anon.0*, align 8 +// CHECK1-NEXT: [[TMP:%.*]] = alloca %class.anon.0*, align 8 +// CHECK1-NEXT: store %struct.S* [[THIS]], %struct.S** [[THIS_ADDR]], align 8 +// CHECK1-NEXT: store %class.anon.0* [[L]], %class.anon.0** [[L_ADDR]], align 8 +// CHECK1-NEXT: [[TMP0:%.*]] = load %struct.S*, %struct.S** [[THIS_ADDR]], align 8 +// CHECK1-NEXT: [[TMP1:%.*]] = load %class.anon.0*, %class.anon.0** [[L_ADDR]], align 8 +// CHECK1-NEXT: store %class.anon.0* [[TMP1]], %class.anon.0** [[TMP]], align 8 +// CHECK1-NEXT: [[TMP2:%.*]] = load 
%class.anon.0*, %class.anon.0** [[TMP]], align 8 +// CHECK1-NEXT: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* @[[GLOB1]], i32 2, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, %struct.S*, %class.anon.0*)* @.omp_outlined..5 to void (i32*, i32*, ...)*), %struct.S* [[TMP0]], %class.anon.0* [[TMP2]]) +// CHECK1-NEXT: ret void +// +// +// CHECK1-LABEL: define {{[^@]+}}@.omp_outlined..5 +// CHECK1-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], %struct.S* [[THIS:%.*]], %class.anon.0* nonnull align 8 dereferenceable(8) [[L:%.*]]) #[[ATTR1]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK1-NEXT: [[THIS_ADDR:%.*]] = alloca %struct.S*, align 8 +// CHECK1-NEXT: [[L_ADDR:%.*]] = alloca %class.anon.0*, align 8 +// CHECK1-NEXT: [[TMP:%.*]] = alloca %class.anon.0*, align 8 +// CHECK1-NEXT: [[L1:%.*]] = alloca [[CLASS_ANON_0:%.*]], align 8 +// CHECK1-NEXT: [[_TMP2:%.*]] = alloca %class.anon.0*, align 8 +// CHECK1-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK1-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +// CHECK1-NEXT: store %struct.S* [[THIS]], %struct.S** [[THIS_ADDR]], align 8 +// CHECK1-NEXT: store %class.anon.0* [[L]], %class.anon.0** [[L_ADDR]], align 8 +// CHECK1-NEXT: [[TMP0:%.*]] = load %struct.S*, %struct.S** [[THIS_ADDR]], align 8 +// CHECK1-NEXT: [[TMP1:%.*]] = load %class.anon.0*, %class.anon.0** [[L_ADDR]], align 8 +// CHECK1-NEXT: store %class.anon.0* [[TMP1]], %class.anon.0** [[TMP]], align 8 +// CHECK1-NEXT: [[TMP2:%.*]] = load %class.anon.0*, %class.anon.0** [[TMP]], align 8 +// CHECK1-NEXT: [[TMP3:%.*]] = bitcast %class.anon.0* [[L1]] to i8* +// CHECK1-NEXT: [[TMP4:%.*]] = bitcast %class.anon.0* [[TMP2]] to i8* +// CHECK1-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP3]], i8* align 8 [[TMP4]], i64 8, i1 false) +// CHECK1-NEXT: store %class.anon.0* [[L1]], %class.anon.0** [[_TMP2]], align 8 +// CHECK1-NEXT: [[TMP5:%.*]] = load %class.anon.0*, %class.anon.0** [[_TMP2]], align 8 +// CHECK1-NEXT: [[CALL:%.*]] = call signext i32 @_ZZN1S3fooEvENKUlvE_clEv(%class.anon.0* nonnull dereferenceable(8) [[TMP5]]) +// CHECK1-NEXT: ret void +// +// +// CHECK1-LABEL: define {{[^@]+}}@_Z3fooIZN1S3fooEvEUlvE_EiRKT_ +// CHECK1-SAME: (%class.anon.0* nonnull align 8 dereferenceable(8) [[T:%.*]]) #[[ATTR3]] comdat { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[T_ADDR:%.*]] = alloca %class.anon.0*, align 8 +// CHECK1-NEXT: [[TMP:%.*]] = alloca %class.anon.0*, align 8 +// CHECK1-NEXT: [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [2 x i8*], align 8 +// CHECK1-NEXT: [[DOTOFFLOAD_PTRS:%.*]] = alloca [2 x i8*], align 8 +// CHECK1-NEXT: [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [2 x i8*], align 8 +// CHECK1-NEXT: store %class.anon.0* [[T]], %class.anon.0** [[T_ADDR]], align 8 +// CHECK1-NEXT: [[TMP0:%.*]] = load %class.anon.0*, %class.anon.0** [[T_ADDR]], align 8 +// CHECK1-NEXT: store %class.anon.0* [[TMP0]], %class.anon.0** [[TMP]], align 8 +// CHECK1-NEXT: [[TMP1:%.*]] = load %class.anon.0*, %class.anon.0** [[TMP]], align 8 +// CHECK1-NEXT: [[TMP2:%.*]] = getelementptr inbounds [[CLASS_ANON_0:%.*]], %class.anon.0* [[TMP1]], i32 0, i32 0 +// CHECK1-NEXT: [[TMP3:%.*]] = getelementptr inbounds [[CLASS_ANON_0]], %class.anon.0* [[TMP1]], i32 0, i32 0 +// CHECK1-NEXT: [[TMP4:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* 
[[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 +// CHECK1-NEXT: [[TMP5:%.*]] = bitcast i8** [[TMP4]] to %class.anon.0** +// CHECK1-NEXT: store %class.anon.0* [[TMP1]], %class.anon.0** [[TMP5]], align 8 +// CHECK1-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_PTRS]], i32 0, i32 0 +// CHECK1-NEXT: [[TMP7:%.*]] = bitcast i8** [[TMP6]] to %class.anon.0** +// CHECK1-NEXT: store %class.anon.0* [[TMP1]], %class.anon.0** [[TMP7]], align 8 +// CHECK1-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_MAPPERS]], i64 0, i64 0 +// CHECK1-NEXT: store i8* null, i8** [[TMP8]], align 8 +// CHECK1-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 1 +// CHECK1-NEXT: [[TMP10:%.*]] = bitcast i8** [[TMP9]] to %struct.S*** +// CHECK1-NEXT: store %struct.S** [[TMP2]], %struct.S*** [[TMP10]], align 8 +// CHECK1-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_PTRS]], i32 0, i32 1 +// CHECK1-NEXT: [[TMP12:%.*]] = bitcast i8** [[TMP11]] to %struct.S*** +// CHECK1-NEXT: store %struct.S** [[TMP3]], %struct.S*** [[TMP12]], align 8 +// CHECK1-NEXT: [[TMP13:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_MAPPERS]], i64 0, i64 1 +// CHECK1-NEXT: store i8* null, i8** [[TMP13]], align 8 +// CHECK1-NEXT: [[TMP14:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 +// CHECK1-NEXT: [[TMP15:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_PTRS]], i32 0, i32 0 +// CHECK1-NEXT: [[TMP16:%.*]] = call i32 @__tgt_target_teams_mapper(%struct.ident_t* @[[GLOB1]], i64 -1, i8* @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooIZN1S3fooEvEUlvE_EiRKT__l18.region_id, i32 2, i8** [[TMP14]], i8** [[TMP15]], i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_sizes.9, i32 0, i32 0), i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_maptypes.10, i32 0, i32 0), i8** null, i8** null, i32 1, i32 0) +// CHECK1-NEXT: [[TMP17:%.*]] = icmp ne i32 [[TMP16]], 0 +// CHECK1-NEXT: br i1 [[TMP17]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] +// CHECK1: omp_offload.failed: +// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooIZN1S3fooEvEUlvE_EiRKT__l18(%class.anon.0* [[TMP1]]) #[[ATTR4]] +// CHECK1-NEXT: br label [[OMP_OFFLOAD_CONT]] +// CHECK1: omp_offload.cont: +// CHECK1-NEXT: ret i32 0 +// +// +// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooIZN1S3fooEvEUlvE_EiRKT__l18 +// CHECK1-SAME: (%class.anon.0* nonnull align 8 dereferenceable(8) [[T:%.*]]) #[[ATTR1]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[T_ADDR:%.*]] = alloca %class.anon.0*, align 8 +// CHECK1-NEXT: [[TMP:%.*]] = alloca %class.anon.0*, align 8 +// CHECK1-NEXT: store %class.anon.0* [[T]], %class.anon.0** [[T_ADDR]], align 8 +// CHECK1-NEXT: [[TMP0:%.*]] = load %class.anon.0*, %class.anon.0** [[T_ADDR]], align 8 +// CHECK1-NEXT: store %class.anon.0* [[TMP0]], %class.anon.0** [[TMP]], align 8 +// CHECK1-NEXT: [[TMP1:%.*]] = load %class.anon.0*, %class.anon.0** [[TMP]], align 8 +// CHECK1-NEXT: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) 
@__kmpc_fork_call(%struct.ident_t* @[[GLOB1]], i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, %class.anon.0*)* @.omp_outlined..8 to void (i32*, i32*, ...)*), %class.anon.0* [[TMP1]]) +// CHECK1-NEXT: ret void +// +// +// CHECK1-LABEL: define {{[^@]+}}@.omp_outlined..8 +// CHECK1-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], %class.anon.0* nonnull align 8 dereferenceable(8) [[T:%.*]]) #[[ATTR1]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK1-NEXT: [[T_ADDR:%.*]] = alloca %class.anon.0*, align 8 +// CHECK1-NEXT: [[TMP:%.*]] = alloca %class.anon.0*, align 8 +// CHECK1-NEXT: [[T1:%.*]] = alloca [[CLASS_ANON_0:%.*]], align 8 +// CHECK1-NEXT: [[_TMP2:%.*]] = alloca %class.anon.0*, align 8 +// CHECK1-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK1-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +// CHECK1-NEXT: store %class.anon.0* [[T]], %class.anon.0** [[T_ADDR]], align 8 +// CHECK1-NEXT: [[TMP0:%.*]] = load %class.anon.0*, %class.anon.0** [[T_ADDR]], align 8 +// CHECK1-NEXT: store %class.anon.0* [[TMP0]], %class.anon.0** [[TMP]], align 8 +// CHECK1-NEXT: [[TMP1:%.*]] = load %class.anon.0*, %class.anon.0** [[TMP]], align 8 +// CHECK1-NEXT: [[TMP2:%.*]] = bitcast %class.anon.0* [[T1]] to i8* +// CHECK1-NEXT: [[TMP3:%.*]] = bitcast %class.anon.0* [[TMP1]] to i8* +// CHECK1-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP2]], i8* align 8 [[TMP3]], i64 8, i1 false) +// CHECK1-NEXT: store %class.anon.0* [[T1]], %class.anon.0** [[_TMP2]], align 8 +// CHECK1-NEXT: [[TMP4:%.*]] = load %class.anon.0*, %class.anon.0** [[_TMP2]], align 8 +// CHECK1-NEXT: [[CALL:%.*]] = call signext i32 @_ZZN1S3fooEvENKUlvE_clEv(%class.anon.0* nonnull dereferenceable(8) [[TMP4]]) +// CHECK1-NEXT: ret void +// +// +// CHECK1-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg +// CHECK1-SAME: () #[[ATTR5:[0-9]+]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: call void @__tgt_register_requires(i64 1) +// CHECK1-NEXT: ret void +// +// +// CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN1S3fooEv_l27_worker +// CHECK2-SAME: () #[[ATTR0:[0-9]+]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 8 +// CHECK2-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1 +// CHECK2-NEXT: store i8* null, i8** [[WORK_FN]], align 8 +// CHECK2-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1 +// CHECK2-NEXT: br label [[DOTAWAIT_WORK:%.*]] +// CHECK2: .await.work: +// CHECK2-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK2-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]]) +// CHECK2-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8 +// CHECK2-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1 +// CHECK2-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 8 +// CHECK2-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null +// CHECK2-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]] +// CHECK2: .select.workers: +// CHECK2-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1 +// CHECK2-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0 +// CHECK2-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]] +// CHECK2: .execute.parallel: +// CHECK2-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* 
@[[GLOB1:[0-9]+]]) +// CHECK2-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)* +// CHECK2-NEXT: call void [[TMP5]](i16 0, i32 [[TMP4]]) +// CHECK2-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]] +// CHECK2: .terminate.parallel: +// CHECK2-NEXT: call void @__kmpc_kernel_end_parallel() +// CHECK2-NEXT: br label [[DOTBARRIER_PARALLEL]] +// CHECK2: .barrier.parallel: +// CHECK2-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK2-NEXT: br label [[DOTAWAIT_WORK]] +// CHECK2: .exit: +// CHECK2-NEXT: ret void +// +// +// CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN1S3fooEv_l27 +// CHECK2-SAME: (%struct.S* [[THIS:%.*]], %class.anon* nonnull align 8 dereferenceable(8) [[L:%.*]]) #[[ATTR1:[0-9]+]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[THIS_ADDR:%.*]] = alloca %struct.S*, align 8 +// CHECK2-NEXT: [[L_ADDR:%.*]] = alloca %class.anon*, align 8 +// CHECK2-NEXT: [[TMP:%.*]] = alloca %class.anon*, align 8 +// CHECK2-NEXT: [[L7:%.*]] = alloca [[CLASS_ANON:%.*]], align 8 +// CHECK2-NEXT: [[_TMP8:%.*]] = alloca %class.anon*, align 8 +// CHECK2-NEXT: store %struct.S* [[THIS]], %struct.S** [[THIS_ADDR]], align 8 +// CHECK2-NEXT: store %class.anon* [[L]], %class.anon** [[L_ADDR]], align 8 +// CHECK2-NEXT: [[TMP0:%.*]] = load %struct.S*, %struct.S** [[THIS_ADDR]], align 8 +// CHECK2-NEXT: [[TMP1:%.*]] = load %class.anon*, %class.anon** [[L_ADDR]], align 8 +// CHECK2-NEXT: store %class.anon* [[TMP1]], %class.anon** [[TMP]], align 8 +// CHECK2-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK2-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK2-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK2-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]] +// CHECK2-NEXT: [[TMP2:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]] +// CHECK2-NEXT: br i1 [[TMP2]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]] +// CHECK2: .worker: +// CHECK2-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN1S3fooEv_l27_worker() #[[ATTR6:[0-9]+]] +// CHECK2-NEXT: br label [[DOTEXIT:%.*]] +// CHECK2: .mastercheck: +// CHECK2-NEXT: [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK2-NEXT: [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK2-NEXT: [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK2-NEXT: [[TMP3:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1 +// CHECK2-NEXT: [[TMP4:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1 +// CHECK2-NEXT: [[TMP5:%.*]] = xor i32 [[TMP3]], -1 +// CHECK2-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP4]], [[TMP5]] +// CHECK2-NEXT: [[TMP6:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]] +// CHECK2-NEXT: br i1 [[TMP6]], label [[DOTMASTER:%.*]], label [[DOTEXIT]] +// CHECK2: .master: +// CHECK2-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK2-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK2-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]] +// CHECK2-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1) +// CHECK2-NEXT: call void @__kmpc_data_sharing_init_stack() +// CHECK2-NEXT: [[TMP7:%.*]] = load %class.anon*, %class.anon** [[TMP]], align 8 +// CHECK2-NEXT: [[TMP8:%.*]] = bitcast %class.anon* [[L7]] to i8* +// CHECK2-NEXT: [[TMP9:%.*]] = bitcast %class.anon* [[TMP7]] to i8* +// 
CHECK2-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP8]], i8* align 8 [[TMP9]], i64 8, i1 false) +// CHECK2-NEXT: store %class.anon* [[L7]], %class.anon** [[_TMP8]], align 8 +// CHECK2-NEXT: [[TMP10:%.*]] = load %class.anon*, %class.anon** [[_TMP8]], align 8 +// CHECK2-NEXT: [[TMP11:%.*]] = getelementptr inbounds [[CLASS_ANON]], %class.anon* [[TMP10]], i32 0, i32 0 +// CHECK2-NEXT: store %struct.S* [[TMP0]], %struct.S** [[TMP11]], align 8 +// CHECK2-NEXT: [[TMP12:%.*]] = load %class.anon*, %class.anon** [[_TMP8]], align 8 +// CHECK2-NEXT: [[CALL:%.*]] = call i32 @_ZZN1S3fooEvENKUlvE_clEv(%class.anon* nonnull dereferenceable(8) [[TMP12]]) #[[ATTR7:[0-9]+]] +// CHECK2-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]] +// CHECK2: .termination.notifier: +// CHECK2-NEXT: call void @__kmpc_kernel_deinit(i16 1) +// CHECK2-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK2-NEXT: br label [[DOTEXIT]] +// CHECK2: .exit: +// CHECK2-NEXT: ret void +// +// +// CHECK2-LABEL: define {{[^@]+}}@_ZZN1S3fooEvENKUlvE_clEv +// CHECK2-SAME: (%class.anon* nonnull dereferenceable(8) [[THIS:%.*]]) #[[ATTR4:[0-9]+]] comdat align 2 { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[THIS_ADDR:%.*]] = alloca %class.anon*, align 8 +// CHECK2-NEXT: store %class.anon* [[THIS]], %class.anon** [[THIS_ADDR]], align 8 +// CHECK2-NEXT: [[THIS1:%.*]] = load %class.anon*, %class.anon** [[THIS_ADDR]], align 8 +// CHECK2-NEXT: [[TMP0:%.*]] = getelementptr inbounds [[CLASS_ANON:%.*]], %class.anon* [[THIS1]], i32 0, i32 0 +// CHECK2-NEXT: [[TMP1:%.*]] = load %struct.S*, %struct.S** [[TMP0]], align 8 +// CHECK2-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT_S:%.*]], %struct.S* [[TMP1]], i32 0, i32 0 +// CHECK2-NEXT: [[TMP2:%.*]] = load i32, i32* [[A]], align 4 +// CHECK2-NEXT: ret i32 [[TMP2]] +// +// +// CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN1S3fooEv_l29 +// CHECK2-SAME: (%struct.S* [[THIS:%.*]], %class.anon* nonnull align 8 dereferenceable(8) [[L:%.*]]) #[[ATTR1]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[THIS_ADDR:%.*]] = alloca %struct.S*, align 8 +// CHECK2-NEXT: [[L_ADDR:%.*]] = alloca %class.anon*, align 8 +// CHECK2-NEXT: [[TMP:%.*]] = alloca %class.anon*, align 8 +// CHECK2-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [2 x i8*], align 8 +// CHECK2-NEXT: store %struct.S* [[THIS]], %struct.S** [[THIS_ADDR]], align 8 +// CHECK2-NEXT: store %class.anon* [[L]], %class.anon** [[L_ADDR]], align 8 +// CHECK2-NEXT: [[TMP0:%.*]] = load %struct.S*, %struct.S** [[THIS_ADDR]], align 8 +// CHECK2-NEXT: [[TMP1:%.*]] = load %class.anon*, %class.anon** [[L_ADDR]], align 8 +// CHECK2-NEXT: store %class.anon* [[TMP1]], %class.anon** [[TMP]], align 8 +// CHECK2-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK2-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 1) +// CHECK2-NEXT: call void @__kmpc_data_sharing_init_stack_spmd() +// CHECK2-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK2: .execute: +// CHECK2-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2:[0-9]+]]) +// CHECK2-NEXT: [[TMP3:%.*]] = load %class.anon*, %class.anon** [[TMP]], align 8 +// CHECK2-NEXT: [[TMP4:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0 +// CHECK2-NEXT: [[TMP5:%.*]] = bitcast %struct.S* [[TMP0]] to i8* +// CHECK2-NEXT: store i8* [[TMP5]], i8** [[TMP4]], align 8 +// CHECK2-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* 
+// CHECK2-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1
+// CHECK2-NEXT: [[TMP7:%.*]] = bitcast %class.anon* [[TMP3]] to i8*
+// CHECK2-NEXT: store i8* [[TMP7]], i8** [[TMP6]], align 8
+// CHECK2-NEXT: [[TMP8:%.*]] = bitcast [2 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK2-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, %struct.S*, %class.anon*)* @__omp_outlined__ to i8*), i8* null, i8** [[TMP8]], i64 2)
+// CHECK2-NEXT: br label [[DOTOMP_DEINIT:%.*]]
+// CHECK2: .omp.deinit:
+// CHECK2-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 1)
+// CHECK2-NEXT: br label [[DOTEXIT:%.*]]
+// CHECK2: .exit:
+// CHECK2-NEXT: ret void
+//
+//
+// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__
+// CHECK2-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], %struct.S* [[THIS:%.*]], %class.anon* nonnull align 8 dereferenceable(8) [[L:%.*]]) #[[ATTR1]] {
+// CHECK2-NEXT: entry:
+// CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK2-NEXT: [[THIS_ADDR:%.*]] = alloca %struct.S*, align 8
+// CHECK2-NEXT: [[L_ADDR:%.*]] = alloca %class.anon*, align 8
+// CHECK2-NEXT: [[TMP:%.*]] = alloca %class.anon*, align 8
+// CHECK2-NEXT: [[L1:%.*]] = alloca [[CLASS_ANON:%.*]], align 8
+// CHECK2-NEXT: [[_TMP2:%.*]] = alloca %class.anon*, align 8
+// CHECK2-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK2-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
+// CHECK2-NEXT: store %struct.S* [[THIS]], %struct.S** [[THIS_ADDR]], align 8
+// CHECK2-NEXT: store %class.anon* [[L]], %class.anon** [[L_ADDR]], align 8
+// CHECK2-NEXT: [[TMP0:%.*]] = load %struct.S*, %struct.S** [[THIS_ADDR]], align 8
+// CHECK2-NEXT: [[TMP1:%.*]] = load %class.anon*, %class.anon** [[L_ADDR]], align 8
+// CHECK2-NEXT: store %class.anon* [[TMP1]], %class.anon** [[TMP]], align 8
+// CHECK2-NEXT: [[TMP2:%.*]] = load %class.anon*, %class.anon** [[TMP]], align 8
+// CHECK2-NEXT: [[TMP3:%.*]] = bitcast %class.anon* [[L1]] to i8*
+// CHECK2-NEXT: [[TMP4:%.*]] = bitcast %class.anon* [[TMP2]] to i8*
+// CHECK2-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP3]], i8* align 8 [[TMP4]], i64 8, i1 false)
+// CHECK2-NEXT: store %class.anon* [[L1]], %class.anon** [[_TMP2]], align 8
+// CHECK2-NEXT: [[TMP5:%.*]] = load %class.anon*, %class.anon** [[_TMP2]], align 8
+// CHECK2-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[CLASS_ANON]], %class.anon* [[TMP5]], i32 0, i32 0
+// CHECK2-NEXT: store %struct.S* [[TMP0]], %struct.S** [[TMP6]], align 8
+// CHECK2-NEXT: [[TMP7:%.*]] = load %class.anon*, %class.anon** [[_TMP2]], align 8
+// CHECK2-NEXT: [[CALL:%.*]] = call i32 @_ZZN1S3fooEvENKUlvE_clEv(%class.anon* nonnull dereferenceable(8) [[TMP7]]) #[[ATTR7]]
+// CHECK2-NEXT: ret void
+//
+//
+// CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l41_worker
+// CHECK2-SAME: () #[[ATTR0]] {
+// CHECK2-NEXT: entry:
+// CHECK2-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 8
+// CHECK2-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1
+// CHECK2-NEXT: store i8* null, i8** [[WORK_FN]], align 8
+// CHECK2-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1
+// CHECK2-NEXT: br label [[DOTAWAIT_WORK:%.*]]
+// CHECK2: .await.work:
+// CHECK2-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
+// CHECK2-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]])
+// CHECK2-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8
+// CHECK2-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1
+// CHECK2-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 8
+// CHECK2-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null
+// CHECK2-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]]
+// CHECK2: .select.workers:
+// CHECK2-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1
+// CHECK2-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0
+// CHECK2-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]]
+// CHECK2: .execute.parallel:
+// CHECK2-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK2-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)*
+// CHECK2-NEXT: call void [[TMP5]](i16 0, i32 [[TMP4]])
+// CHECK2-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]]
+// CHECK2: .terminate.parallel:
+// CHECK2-NEXT: call void @__kmpc_kernel_end_parallel()
+// CHECK2-NEXT: br label [[DOTBARRIER_PARALLEL]]
+// CHECK2: .barrier.parallel:
+// CHECK2-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
+// CHECK2-NEXT: br label [[DOTAWAIT_WORK]]
+// CHECK2: .exit:
+// CHECK2-NEXT: ret void
+//
+//
+// CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l41
+// CHECK2-SAME: (i64 [[ARGC:%.*]], i32* nonnull align 4 dereferenceable(4) [[B:%.*]], i32* nonnull align 4 dereferenceable(4) [[C:%.*]], i32* [[D:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], %class.anon.0* nonnull align 8 dereferenceable(40) [[L:%.*]]) #[[ATTR1]] {
+// CHECK2-NEXT: entry:
+// CHECK2-NEXT: [[ARGC_ADDR:%.*]] = alloca i64, align 8
+// CHECK2-NEXT: [[B_ADDR:%.*]] = alloca i32*, align 8
+// CHECK2-NEXT: [[C_ADDR:%.*]] = alloca i32*, align 8
+// CHECK2-NEXT: [[D_ADDR:%.*]] = alloca i32*, align 8
+// CHECK2-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 8
+// CHECK2-NEXT: [[L_ADDR:%.*]] = alloca %class.anon.0*, align 8
+// CHECK2-NEXT: [[TMP:%.*]] = alloca i32*, align 8
+// CHECK2-NEXT: [[_TMP1:%.*]] = alloca i32*, align 8
+// CHECK2-NEXT: [[_TMP2:%.*]] = alloca %class.anon.0*, align 8
+// CHECK2-NEXT: [[L9:%.*]] = alloca [[CLASS_ANON_0:%.*]], align 8
+// CHECK2-NEXT: [[_TMP10:%.*]] = alloca %class.anon.0*, align 8
+// CHECK2-NEXT: [[B11:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[_TMP12:%.*]] = alloca i32*, align 8
+// CHECK2-NEXT: [[C13:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[_TMP14:%.*]] = alloca i32*, align 8
+// CHECK2-NEXT: store i64 [[ARGC]], i64* [[ARGC_ADDR]], align 8
+// CHECK2-NEXT: store i32* [[B]], i32** [[B_ADDR]], align 8
+// CHECK2-NEXT: store i32* [[C]], i32** [[C_ADDR]], align 8
+// CHECK2-NEXT: store i32* [[D]], i32** [[D_ADDR]], align 8
+// CHECK2-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 8
+// CHECK2-NEXT: store %class.anon.0* [[L]], %class.anon.0** [[L_ADDR]], align 8
+// CHECK2-NEXT: [[CONV:%.*]] = bitcast i64* [[ARGC_ADDR]] to i32*
+// CHECK2-NEXT: [[TMP0:%.*]] = load i32*, i32** [[B_ADDR]], align 8
+// CHECK2-NEXT: [[TMP1:%.*]] = load i32*, i32** [[C_ADDR]], align 8
+// CHECK2-NEXT: [[TMP2:%.*]] = load i32*, i32** [[A_ADDR]], align 8
+// CHECK2-NEXT: [[TMP3:%.*]] = load %class.anon.0*, %class.anon.0** [[L_ADDR]], align 8
+// CHECK2-NEXT: store i32* [[TMP0]], i32** [[TMP]], align 8
+// CHECK2-NEXT: store i32* [[TMP1]], i32** [[_TMP1]], align 8
+// CHECK2-NEXT: store %class.anon.0* [[TMP3]], %class.anon.0** [[_TMP2]], align 8
+// CHECK2-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+// CHECK2-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK2-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+// CHECK2-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]]
+// CHECK2-NEXT: [[TMP4:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]]
+// CHECK2-NEXT: br i1 [[TMP4]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]]
+// CHECK2: .worker:
+// CHECK2-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l41_worker() #[[ATTR6]]
+// CHECK2-NEXT: br label [[DOTEXIT:%.*]]
+// CHECK2: .mastercheck:
+// CHECK2-NEXT: [[NVPTX_TID3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+// CHECK2-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK2-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+// CHECK2-NEXT: [[TMP5:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE5]], 1
+// CHECK2-NEXT: [[TMP6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], 1
+// CHECK2-NEXT: [[TMP7:%.*]] = xor i32 [[TMP5]], -1
+// CHECK2-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP6]], [[TMP7]]
+// CHECK2-NEXT: [[TMP8:%.*]] = icmp eq i32 [[NVPTX_TID3]], [[MASTER_TID]]
+// CHECK2-NEXT: br i1 [[TMP8]], label [[DOTMASTER:%.*]], label [[DOTEXIT]]
+// CHECK2: .master:
+// CHECK2-NEXT: [[NVPTX_NUM_THREADS6:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK2-NEXT: [[NVPTX_WARP_SIZE7:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+// CHECK2-NEXT: [[THREAD_LIMIT8:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS6]], [[NVPTX_WARP_SIZE7]]
+// CHECK2-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT8]], i16 1)
+// CHECK2-NEXT: call void @__kmpc_data_sharing_init_stack()
+// CHECK2-NEXT: [[TMP9:%.*]] = load %class.anon.0*, %class.anon.0** [[_TMP2]], align 8
+// CHECK2-NEXT: [[TMP10:%.*]] = bitcast %class.anon.0* [[L9]] to i8*
+// CHECK2-NEXT: [[TMP11:%.*]] = bitcast %class.anon.0* [[TMP9]] to i8*
+// CHECK2-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP10]], i8* align 8 [[TMP11]], i64 40, i1 false)
+// CHECK2-NEXT: store %class.anon.0* [[L9]], %class.anon.0** [[_TMP10]], align 8
+// CHECK2-NEXT: [[TMP12:%.*]] = load i32*, i32** [[TMP]], align 8
+// CHECK2-NEXT: [[TMP13:%.*]] = load i32, i32* [[TMP12]], align 4
+// CHECK2-NEXT: store i32 [[TMP13]], i32* [[B11]], align 4
+// CHECK2-NEXT: store i32* [[B11]], i32** [[_TMP12]], align 8
+// CHECK2-NEXT: [[TMP14:%.*]] = load i32*, i32** [[_TMP1]], align 8
+// CHECK2-NEXT: [[TMP15:%.*]] = load i32, i32* [[TMP14]], align 4
+// CHECK2-NEXT: store i32 [[TMP15]], i32* [[C13]], align 4
+// CHECK2-NEXT: store i32* [[C13]], i32** [[_TMP14]], align 8
+// CHECK2-NEXT: [[TMP16:%.*]] = load %class.anon.0*, %class.anon.0** [[_TMP10]], align 8
+// CHECK2-NEXT: [[TMP17:%.*]] = getelementptr inbounds [[CLASS_ANON_0]], %class.anon.0* [[TMP16]], i32 0, i32 0
+// CHECK2-NEXT: store i32* [[CONV]], i32** [[TMP17]], align 8
+// CHECK2-NEXT: [[TMP18:%.*]] = getelementptr inbounds [[CLASS_ANON_0]], %class.anon.0* [[TMP16]], i32 0, i32 1
+// CHECK2-NEXT: [[TMP19:%.*]] = load i32*, i32** [[_TMP12]], align 8
+// CHECK2-NEXT: store i32* [[TMP19]], i32** [[TMP18]], align 8
+// CHECK2-NEXT: [[TMP20:%.*]] = getelementptr inbounds [[CLASS_ANON_0]], %class.anon.0* [[TMP16]], i32 0, i32 2
+// CHECK2-NEXT: [[TMP21:%.*]] = load i32*, i32** [[_TMP14]], align 8
+// CHECK2-NEXT: store i32* [[TMP21]], i32** [[TMP20]], align 8
+// CHECK2-NEXT: [[TMP22:%.*]] = getelementptr inbounds [[CLASS_ANON_0]], %class.anon.0* [[TMP16]], i32 0, i32 3
+// CHECK2-NEXT: store i32** [[D_ADDR]], i32*** [[TMP22]], align 8
+// CHECK2-NEXT: [[TMP23:%.*]] = getelementptr inbounds [[CLASS_ANON_0]], %class.anon.0* [[TMP16]], i32 0, i32 4
+// CHECK2-NEXT: store i32* [[TMP2]], i32** [[TMP23]], align 8
+// CHECK2-NEXT: [[TMP24:%.*]] = load %class.anon.0*, %class.anon.0** [[_TMP10]], align 8
+// CHECK2-NEXT: [[CALL:%.*]] = call i64 @"_ZZ4mainENK3$_0clEv"(%class.anon.0* nonnull dereferenceable(40) [[TMP24]]) #[[ATTR7]]
+// CHECK2-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]]
+// CHECK2: .termination.notifier:
+// CHECK2-NEXT: call void @__kmpc_kernel_deinit(i16 1)
+// CHECK2-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
+// CHECK2-NEXT: br label [[DOTEXIT]]
+// CHECK2: .exit:
+// CHECK2-NEXT: ret void
+//
+//
+// CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l43
+// CHECK2-SAME: (i32* nonnull align 4 dereferenceable(4) [[ARGC:%.*]], i32* nonnull align 4 dereferenceable(4) [[B:%.*]], i32* nonnull align 4 dereferenceable(4) [[C:%.*]], i32* [[D:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], %class.anon.0* nonnull align 8 dereferenceable(40) [[L:%.*]]) #[[ATTR1]] {
+// CHECK2-NEXT: entry:
+// CHECK2-NEXT: [[ARGC_ADDR:%.*]] = alloca i32*, align 8
+// CHECK2-NEXT: [[B_ADDR:%.*]] = alloca i32*, align 8
+// CHECK2-NEXT: [[C_ADDR:%.*]] = alloca i32*, align 8
+// CHECK2-NEXT: [[D_ADDR:%.*]] = alloca i32*, align 8
+// CHECK2-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 8
+// CHECK2-NEXT: [[L_ADDR:%.*]] = alloca %class.anon.0*, align 8
+// CHECK2-NEXT: [[TMP:%.*]] = alloca i32*, align 8
+// CHECK2-NEXT: [[_TMP1:%.*]] = alloca i32*, align 8
+// CHECK2-NEXT: [[_TMP2:%.*]] = alloca %class.anon.0*, align 8
+// CHECK2-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [6 x i8*], align 8
+// CHECK2-NEXT: store i32* [[ARGC]], i32** [[ARGC_ADDR]], align 8
+// CHECK2-NEXT: store i32* [[B]], i32** [[B_ADDR]], align 8
+// CHECK2-NEXT: store i32* [[C]], i32** [[C_ADDR]], align 8
+// CHECK2-NEXT: store i32* [[D]], i32** [[D_ADDR]], align 8
+// CHECK2-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 8
+// CHECK2-NEXT: store %class.anon.0* [[L]], %class.anon.0** [[L_ADDR]], align 8
+// CHECK2-NEXT: [[TMP0:%.*]] = load i32*, i32** [[ARGC_ADDR]], align 8
+// CHECK2-NEXT: [[TMP1:%.*]] = load i32*, i32** [[B_ADDR]], align 8
+// CHECK2-NEXT: [[TMP2:%.*]] = load i32*, i32** [[C_ADDR]], align 8
+// CHECK2-NEXT: [[TMP3:%.*]] = load i32*, i32** [[A_ADDR]], align 8
+// CHECK2-NEXT: [[TMP4:%.*]] = load %class.anon.0*, %class.anon.0** [[L_ADDR]], align 8
+// CHECK2-NEXT: store i32* [[TMP1]], i32** [[TMP]], align 8
+// CHECK2-NEXT: store i32* [[TMP2]], i32** [[_TMP1]], align 8
+// CHECK2-NEXT: store %class.anon.0* [[TMP4]], %class.anon.0** [[_TMP2]], align 8
+// CHECK2-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK2-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 1)
+// CHECK2-NEXT: call void @__kmpc_data_sharing_init_stack_spmd()
+// CHECK2-NEXT: br label [[DOTEXECUTE:%.*]]
+// CHECK2: .execute:
+// CHECK2-NEXT: [[TMP5:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]])
+// CHECK2-NEXT: [[TMP6:%.*]] = load i32*, i32** [[TMP]], align 8
+// CHECK2-NEXT: [[TMP7:%.*]] = load i32*, i32** [[_TMP1]], align 8
+// CHECK2-NEXT: [[TMP8:%.*]] = load i32*, i32** [[D_ADDR]], align 8
+// CHECK2-NEXT: [[TMP9:%.*]] = load %class.anon.0*, %class.anon.0** [[_TMP2]], align 8
+// CHECK2-NEXT: [[TMP10:%.*]] = getelementptr inbounds [6 x i8*], [6 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0
+// CHECK2-NEXT: [[TMP11:%.*]] = bitcast i32* [[TMP0]] to i8*
+// CHECK2-NEXT: store i8* [[TMP11]], i8** [[TMP10]], align 8
+// CHECK2-NEXT: [[TMP12:%.*]] = getelementptr inbounds [6 x i8*], [6 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1
+// CHECK2-NEXT: [[TMP13:%.*]] = bitcast i32* [[TMP6]] to i8*
+// CHECK2-NEXT: store i8* [[TMP13]], i8** [[TMP12]], align 8
+// CHECK2-NEXT: [[TMP14:%.*]] = getelementptr inbounds [6 x i8*], [6 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 2
+// CHECK2-NEXT: [[TMP15:%.*]] = bitcast i32* [[TMP7]] to i8*
+// CHECK2-NEXT: store i8* [[TMP15]], i8** [[TMP14]], align 8
+// CHECK2-NEXT: [[TMP16:%.*]] = getelementptr inbounds [6 x i8*], [6 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 3
+// CHECK2-NEXT: [[TMP17:%.*]] = bitcast i32* [[TMP8]] to i8*
+// CHECK2-NEXT: store i8* [[TMP17]], i8** [[TMP16]], align 8
+// CHECK2-NEXT: [[TMP18:%.*]] = getelementptr inbounds [6 x i8*], [6 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 4
+// CHECK2-NEXT: [[TMP19:%.*]] = bitcast i32* [[TMP3]] to i8*
+// CHECK2-NEXT: store i8* [[TMP19]], i8** [[TMP18]], align 8
+// CHECK2-NEXT: [[TMP20:%.*]] = getelementptr inbounds [6 x i8*], [6 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 5
+// CHECK2-NEXT: [[TMP21:%.*]] = bitcast %class.anon.0* [[TMP9]] to i8*
+// CHECK2-NEXT: store i8* [[TMP21]], i8** [[TMP20]], align 8
+// CHECK2-NEXT: [[TMP22:%.*]] = bitcast [6 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK2-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB2]], i32 [[TMP5]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32*, i32*, i32*, i32*, i32*, %class.anon.0*)* @__omp_outlined__1 to i8*), i8* null, i8** [[TMP22]], i64 6)
+// CHECK2-NEXT: br label [[DOTOMP_DEINIT:%.*]]
+// CHECK2: .omp.deinit:
+// CHECK2-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 1)
+// CHECK2-NEXT: br label [[DOTEXIT:%.*]]
+// CHECK2: .exit:
+// CHECK2-NEXT: ret void
+//
+//
+// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__1
+// CHECK2-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[ARGC:%.*]], i32* nonnull align 4 dereferenceable(4) [[B:%.*]], i32* nonnull align 4 dereferenceable(4) [[C:%.*]], i32* [[D:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], %class.anon.0* nonnull align 8 dereferenceable(40) [[L:%.*]]) #[[ATTR1]] {
+// CHECK2-NEXT: entry:
+// CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK2-NEXT: [[ARGC_ADDR:%.*]] = alloca i32*, align 8
+// CHECK2-NEXT: [[B_ADDR:%.*]] = alloca i32*, align 8
+// CHECK2-NEXT: [[C_ADDR:%.*]] = alloca i32*, align 8
+// CHECK2-NEXT: [[D_ADDR:%.*]] = alloca i32*, align 8
+// CHECK2-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 8
+// CHECK2-NEXT: [[L_ADDR:%.*]] = alloca %class.anon.0*, align 8
+// CHECK2-NEXT: [[TMP:%.*]] = alloca i32*, align 8
+// CHECK2-NEXT: [[_TMP1:%.*]] = alloca i32*, align 8
+// CHECK2-NEXT: [[_TMP2:%.*]] = alloca %class.anon.0*, align 8
+// CHECK2-NEXT: [[L3:%.*]] = alloca [[CLASS_ANON_0:%.*]], align 8
+// CHECK2-NEXT: [[_TMP4:%.*]] = alloca %class.anon.0*, align 8
+// CHECK2-NEXT: [[ARGC5:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[B6:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[_TMP7:%.*]] = alloca i32*, align 8
+// CHECK2-NEXT: [[C8:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[_TMP9:%.*]] = alloca i32*, align 8
+// CHECK2-NEXT: [[A10:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK2-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
+// CHECK2-NEXT: store i32* [[ARGC]], i32** [[ARGC_ADDR]], align 8
+// CHECK2-NEXT: store i32* [[B]], i32** [[B_ADDR]], align 8
+// CHECK2-NEXT: store i32* [[C]], i32** [[C_ADDR]], align 8
+// CHECK2-NEXT: store i32* [[D]], i32** [[D_ADDR]], align 8
+// CHECK2-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 8
+// CHECK2-NEXT: store %class.anon.0* [[L]], %class.anon.0** [[L_ADDR]], align 8
+// CHECK2-NEXT: [[TMP0:%.*]] = load i32*, i32** [[ARGC_ADDR]], align 8
+// CHECK2-NEXT: [[TMP1:%.*]] = load i32*, i32** [[B_ADDR]], align 8
+// CHECK2-NEXT: [[TMP2:%.*]] = load i32*, i32** [[C_ADDR]], align 8
+// CHECK2-NEXT: [[TMP3:%.*]] = load i32*, i32** [[A_ADDR]], align 8
+// CHECK2-NEXT: [[TMP4:%.*]] = load %class.anon.0*, %class.anon.0** [[L_ADDR]], align 8
+// CHECK2-NEXT: store i32* [[TMP1]], i32** [[TMP]], align 8
+// CHECK2-NEXT: store i32* [[TMP2]], i32** [[_TMP1]], align 8
+// CHECK2-NEXT: store %class.anon.0* [[TMP4]], %class.anon.0** [[_TMP2]], align 8
+// CHECK2-NEXT: [[TMP5:%.*]] = load %class.anon.0*, %class.anon.0** [[_TMP2]], align 8
+// CHECK2-NEXT: [[TMP6:%.*]] = bitcast %class.anon.0* [[L3]] to i8*
+// CHECK2-NEXT: [[TMP7:%.*]] = bitcast %class.anon.0* [[TMP5]] to i8*
+// CHECK2-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP6]], i8* align 8 [[TMP7]], i64 40, i1 false)
+// CHECK2-NEXT: store %class.anon.0* [[L3]], %class.anon.0** [[_TMP4]], align 8
+// CHECK2-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP0]], align 4
+// CHECK2-NEXT: store i32 [[TMP8]], i32* [[ARGC5]], align 4
+// CHECK2-NEXT: [[TMP9:%.*]] = load i32*, i32** [[TMP]], align 8
+// CHECK2-NEXT: [[TMP10:%.*]] = load i32, i32* [[TMP9]], align 4
+// CHECK2-NEXT: store i32 [[TMP10]], i32* [[B6]], align 4
+// CHECK2-NEXT: store i32* [[B6]], i32** [[_TMP7]], align 8
+// CHECK2-NEXT: [[TMP11:%.*]] = load i32*, i32** [[_TMP1]], align 8
+// CHECK2-NEXT: [[TMP12:%.*]] = load i32, i32* [[TMP11]], align 4
+// CHECK2-NEXT: store i32 [[TMP12]], i32* [[C8]], align 4
+// CHECK2-NEXT: store i32* [[C8]], i32** [[_TMP9]], align 8
+// CHECK2-NEXT: [[TMP13:%.*]] = load i32, i32* [[TMP3]], align 4
+// CHECK2-NEXT: store i32 [[TMP13]], i32* [[A10]], align 4
+// CHECK2-NEXT: [[TMP14:%.*]] = load %class.anon.0*, %class.anon.0** [[_TMP4]], align 8
+// CHECK2-NEXT: [[TMP15:%.*]] = getelementptr inbounds [[CLASS_ANON_0]], %class.anon.0* [[TMP14]], i32 0, i32 0
+// CHECK2-NEXT: store i32* [[ARGC5]], i32** [[TMP15]], align 8
+// CHECK2-NEXT: [[TMP16:%.*]] = getelementptr inbounds [[CLASS_ANON_0]], %class.anon.0* [[TMP14]], i32 0, i32 1
+// CHECK2-NEXT: [[TMP17:%.*]] = load i32*, i32** [[_TMP7]], align 8
+// CHECK2-NEXT: store i32* [[TMP17]], i32** [[TMP16]], align 8
+// CHECK2-NEXT: [[TMP18:%.*]] = getelementptr inbounds [[CLASS_ANON_0]], %class.anon.0* [[TMP14]], i32 0, i32 2
+// CHECK2-NEXT: [[TMP19:%.*]] = load i32*, i32** [[_TMP9]], align 8
+// CHECK2-NEXT: store i32* [[TMP19]], i32** [[TMP18]], align 8
+// CHECK2-NEXT: [[TMP20:%.*]] = getelementptr inbounds [[CLASS_ANON_0]], %class.anon.0* [[TMP14]], i32 0, i32 3
+// CHECK2-NEXT: store i32** [[D_ADDR]], i32*** [[TMP20]], align 8
+// CHECK2-NEXT: [[TMP21:%.*]] = getelementptr inbounds [[CLASS_ANON_0]], %class.anon.0* [[TMP14]], i32 0, i32 4
+// CHECK2-NEXT: store i32* [[A10]], i32** [[TMP21]], align 8
+// CHECK2-NEXT: [[TMP22:%.*]] = load %class.anon.0*, %class.anon.0** [[_TMP4]], align 8
@"_ZZ4mainENK3$_0clEv"(%class.anon.0* nonnull dereferenceable(40) [[TMP22]]) #[[ATTR7]] +// CHECK2-NEXT: ret void +// +// +// CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooIZN1S3fooEvEUlvE_EiRKT__l18 +// CHECK2-SAME: (%class.anon* nonnull align 8 dereferenceable(8) [[T:%.*]]) #[[ATTR1]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[T_ADDR:%.*]] = alloca %class.anon*, align 8 +// CHECK2-NEXT: [[TMP:%.*]] = alloca %class.anon*, align 8 +// CHECK2-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [1 x i8*], align 8 +// CHECK2-NEXT: store %class.anon* [[T]], %class.anon** [[T_ADDR]], align 8 +// CHECK2-NEXT: [[TMP0:%.*]] = load %class.anon*, %class.anon** [[T_ADDR]], align 8 +// CHECK2-NEXT: store %class.anon* [[TMP0]], %class.anon** [[TMP]], align 8 +// CHECK2-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK2-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 1) +// CHECK2-NEXT: call void @__kmpc_data_sharing_init_stack_spmd() +// CHECK2-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK2: .execute: +// CHECK2-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]]) +// CHECK2-NEXT: [[TMP2:%.*]] = load %class.anon*, %class.anon** [[TMP]], align 8 +// CHECK2-NEXT: [[TMP3:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0 +// CHECK2-NEXT: [[TMP4:%.*]] = bitcast %class.anon* [[TMP2]] to i8* +// CHECK2-NEXT: store i8* [[TMP4]], i8** [[TMP3]], align 8 +// CHECK2-NEXT: [[TMP5:%.*]] = bitcast [1 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK2-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB2]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, %class.anon*)* @__omp_outlined__2 to i8*), i8* null, i8** [[TMP5]], i64 1) +// CHECK2-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK2: .omp.deinit: +// CHECK2-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 1) +// CHECK2-NEXT: br label [[DOTEXIT:%.*]] +// CHECK2: .exit: +// CHECK2-NEXT: ret void +// +// +// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__2 +// CHECK2-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], %class.anon* nonnull align 8 dereferenceable(8) [[T:%.*]]) #[[ATTR1]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK2-NEXT: [[T_ADDR:%.*]] = alloca %class.anon*, align 8 +// CHECK2-NEXT: [[TMP:%.*]] = alloca %class.anon*, align 8 +// CHECK2-NEXT: [[T1:%.*]] = alloca [[CLASS_ANON:%.*]], align 8 +// CHECK2-NEXT: [[_TMP2:%.*]] = alloca %class.anon*, align 8 +// CHECK2-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK2-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +// CHECK2-NEXT: store %class.anon* [[T]], %class.anon** [[T_ADDR]], align 8 +// CHECK2-NEXT: [[TMP0:%.*]] = load %class.anon*, %class.anon** [[T_ADDR]], align 8 +// CHECK2-NEXT: store %class.anon* [[TMP0]], %class.anon** [[TMP]], align 8 +// CHECK2-NEXT: [[TMP1:%.*]] = load %class.anon*, %class.anon** [[TMP]], align 8 +// CHECK2-NEXT: [[TMP2:%.*]] = bitcast %class.anon* [[T1]] to i8* +// CHECK2-NEXT: [[TMP3:%.*]] = bitcast %class.anon* [[TMP1]] to i8* +// CHECK2-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP2]], i8* align 8 [[TMP3]], i64 8, i1 false) +// CHECK2-NEXT: store %class.anon* [[T1]], %class.anon** [[_TMP2]], align 8 +// CHECK2-NEXT: [[TMP4:%.*]] = load %class.anon*, %class.anon** 
+// CHECK2-NEXT: [[TMP4:%.*]] = load %class.anon*, %class.anon** [[_TMP2]], align 8
+// CHECK2-NEXT: [[TMP5:%.*]] = load %class.anon*, %class.anon** [[_TMP2]], align 8
+// CHECK2-NEXT: [[CALL:%.*]] = call i32 @_ZZN1S3fooEvENKUlvE_clEv(%class.anon* nonnull dereferenceable(8) [[TMP5]]) #[[ATTR7]]
+// CHECK2-NEXT: ret void
+//
+//
+// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l41_worker
+// CHECK3-SAME: () #[[ATTR0:[0-9]+]] {
+// CHECK3-NEXT: entry:
+// CHECK3-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 8
+// CHECK3-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1
+// CHECK3-NEXT: store i8* null, i8** [[WORK_FN]], align 8
+// CHECK3-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1
+// CHECK3-NEXT: br label [[DOTAWAIT_WORK:%.*]]
+// CHECK3: .await.work:
+// CHECK3-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
+// CHECK3-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]])
+// CHECK3-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8
+// CHECK3-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1
+// CHECK3-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 8
+// CHECK3-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null
+// CHECK3-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]]
+// CHECK3: .select.workers:
+// CHECK3-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1
+// CHECK3-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0
+// CHECK3-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]]
+// CHECK3: .execute.parallel:
+// CHECK3-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]])
+// CHECK3-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)*
+// CHECK3-NEXT: call void [[TMP5]](i16 0, i32 [[TMP4]])
+// CHECK3-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]]
+// CHECK3: .terminate.parallel:
+// CHECK3-NEXT: call void @__kmpc_kernel_end_parallel()
+// CHECK3-NEXT: br label [[DOTBARRIER_PARALLEL]]
+// CHECK3: .barrier.parallel:
+// CHECK3-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
+// CHECK3-NEXT: br label [[DOTAWAIT_WORK]]
+// CHECK3: .exit:
+// CHECK3-NEXT: ret void
+//
+//
+// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l41
+// CHECK3-SAME: (i64 [[ARGC:%.*]], i32* nonnull align 4 dereferenceable(4) [[B:%.*]], i32* nonnull align 4 dereferenceable(4) [[C:%.*]], i32* [[D:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], %class.anon* nonnull align 8 dereferenceable(40) [[L:%.*]]) #[[ATTR1:[0-9]+]] {
+// CHECK3-NEXT: entry:
+// CHECK3-NEXT: [[ARGC_ADDR:%.*]] = alloca i64, align 8
+// CHECK3-NEXT: [[B_ADDR:%.*]] = alloca i32*, align 8
+// CHECK3-NEXT: [[C_ADDR:%.*]] = alloca i32*, align 8
+// CHECK3-NEXT: [[D_ADDR:%.*]] = alloca i32*, align 8
+// CHECK3-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 8
+// CHECK3-NEXT: [[L_ADDR:%.*]] = alloca %class.anon*, align 8
+// CHECK3-NEXT: [[TMP:%.*]] = alloca i32*, align 8
+// CHECK3-NEXT: [[_TMP1:%.*]] = alloca i32*, align 8
+// CHECK3-NEXT: [[_TMP2:%.*]] = alloca %class.anon*, align 8
+// CHECK3-NEXT: [[L9:%.*]] = alloca [[CLASS_ANON:%.*]], align 8
+// CHECK3-NEXT: [[_TMP10:%.*]] = alloca %class.anon*, align 8
+// CHECK3-NEXT: [[B11:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[_TMP12:%.*]] = alloca i32*, align 8
+// CHECK3-NEXT: [[C13:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[_TMP14:%.*]] = alloca i32*, align 8
+// CHECK3-NEXT: store i64 [[ARGC]], i64* [[ARGC_ADDR]], align 8
+// CHECK3-NEXT: store i32* [[B]], i32** [[B_ADDR]], align 8
+// CHECK3-NEXT: store i32* [[C]], i32** [[C_ADDR]], align 8
+// CHECK3-NEXT: store i32* [[D]], i32** [[D_ADDR]], align 8
+// CHECK3-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 8
+// CHECK3-NEXT: store %class.anon* [[L]], %class.anon** [[L_ADDR]], align 8
+// CHECK3-NEXT: [[CONV:%.*]] = bitcast i64* [[ARGC_ADDR]] to i32*
+// CHECK3-NEXT: [[TMP0:%.*]] = load i32*, i32** [[B_ADDR]], align 8
+// CHECK3-NEXT: [[TMP1:%.*]] = load i32*, i32** [[C_ADDR]], align 8
+// CHECK3-NEXT: [[TMP2:%.*]] = load i32*, i32** [[A_ADDR]], align 8
+// CHECK3-NEXT: [[TMP3:%.*]] = load %class.anon*, %class.anon** [[L_ADDR]], align 8
+// CHECK3-NEXT: store i32* [[TMP0]], i32** [[TMP]], align 8
+// CHECK3-NEXT: store i32* [[TMP1]], i32** [[_TMP1]], align 8
+// CHECK3-NEXT: store %class.anon* [[TMP3]], %class.anon** [[_TMP2]], align 8
+// CHECK3-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+// CHECK3-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK3-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+// CHECK3-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]]
+// CHECK3-NEXT: [[TMP4:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]]
+// CHECK3-NEXT: br i1 [[TMP4]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]]
+// CHECK3: .worker:
+// CHECK3-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l41_worker() #[[ATTR6:[0-9]+]]
+// CHECK3-NEXT: br label [[DOTEXIT:%.*]]
+// CHECK3: .mastercheck:
+// CHECK3-NEXT: [[NVPTX_TID3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+// CHECK3-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK3-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+// CHECK3-NEXT: [[TMP5:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE5]], 1
+// CHECK3-NEXT: [[TMP6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], 1
+// CHECK3-NEXT: [[TMP7:%.*]] = xor i32 [[TMP5]], -1
+// CHECK3-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP6]], [[TMP7]]
+// CHECK3-NEXT: [[TMP8:%.*]] = icmp eq i32 [[NVPTX_TID3]], [[MASTER_TID]]
+// CHECK3-NEXT: br i1 [[TMP8]], label [[DOTMASTER:%.*]], label [[DOTEXIT]]
+// CHECK3: .master:
+// CHECK3-NEXT: [[NVPTX_NUM_THREADS6:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK3-NEXT: [[NVPTX_WARP_SIZE7:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+// CHECK3-NEXT: [[THREAD_LIMIT8:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS6]], [[NVPTX_WARP_SIZE7]]
+// CHECK3-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT8]], i16 1)
+// CHECK3-NEXT: call void @__kmpc_data_sharing_init_stack()
+// CHECK3-NEXT: [[TMP9:%.*]] = load %class.anon*, %class.anon** [[_TMP2]], align 8
+// CHECK3-NEXT: [[TMP10:%.*]] = bitcast %class.anon* [[L9]] to i8*
+// CHECK3-NEXT: [[TMP11:%.*]] = bitcast %class.anon* [[TMP9]] to i8*
+// CHECK3-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP10]], i8* align 8 [[TMP11]], i64 40, i1 false)
+// CHECK3-NEXT: store %class.anon* [[L9]], %class.anon** [[_TMP10]], align 8
+// CHECK3-NEXT: [[TMP12:%.*]] = load i32*, i32** [[TMP]], align 8
+// CHECK3-NEXT: [[TMP13:%.*]] = load i32, i32* [[TMP12]], align 4
+// CHECK3-NEXT: store i32 [[TMP13]], i32* [[B11]], align 4
+// CHECK3-NEXT: store i32* [[B11]], i32** [[_TMP12]], align 8
+// CHECK3-NEXT: [[TMP14:%.*]] = load i32*, i32** [[_TMP1]], align 8
+// CHECK3-NEXT: [[TMP15:%.*]] = load i32, i32* [[TMP14]], align 4
+// CHECK3-NEXT: store i32 [[TMP15]], i32* [[C13]], align 4
+// CHECK3-NEXT: store i32* [[C13]], i32** [[_TMP14]], align 8
+// CHECK3-NEXT: [[TMP16:%.*]] = load %class.anon*, %class.anon** [[_TMP10]], align 8
+// CHECK3-NEXT: [[TMP17:%.*]] = getelementptr inbounds [[CLASS_ANON]], %class.anon* [[TMP16]], i32 0, i32 0
+// CHECK3-NEXT: store i32* [[CONV]], i32** [[TMP17]], align 8
+// CHECK3-NEXT: [[TMP18:%.*]] = getelementptr inbounds [[CLASS_ANON]], %class.anon* [[TMP16]], i32 0, i32 1
+// CHECK3-NEXT: [[TMP19:%.*]] = load i32*, i32** [[_TMP12]], align 8
+// CHECK3-NEXT: store i32* [[TMP19]], i32** [[TMP18]], align 8
+// CHECK3-NEXT: [[TMP20:%.*]] = getelementptr inbounds [[CLASS_ANON]], %class.anon* [[TMP16]], i32 0, i32 2
+// CHECK3-NEXT: [[TMP21:%.*]] = load i32*, i32** [[_TMP14]], align 8
+// CHECK3-NEXT: store i32* [[TMP21]], i32** [[TMP20]], align 8
+// CHECK3-NEXT: [[TMP22:%.*]] = getelementptr inbounds [[CLASS_ANON]], %class.anon* [[TMP16]], i32 0, i32 3
+// CHECK3-NEXT: store i32** [[D_ADDR]], i32*** [[TMP22]], align 8
+// CHECK3-NEXT: [[TMP23:%.*]] = getelementptr inbounds [[CLASS_ANON]], %class.anon* [[TMP16]], i32 0, i32 4
+// CHECK3-NEXT: store i32* [[TMP2]], i32** [[TMP23]], align 8
+// CHECK3-NEXT: [[TMP24:%.*]] = load %class.anon*, %class.anon** [[_TMP10]], align 8
+// CHECK3-NEXT: [[CALL:%.*]] = call i64 @"_ZZ4mainENK3$_0clEv"(%class.anon* nonnull dereferenceable(40) [[TMP24]]) #[[ATTR7:[0-9]+]]
+// CHECK3-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]]
+// CHECK3: .termination.notifier:
+// CHECK3-NEXT: call void @__kmpc_kernel_deinit(i16 1)
+// CHECK3-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
+// CHECK3-NEXT: br label [[DOTEXIT]]
+// CHECK3: .exit:
+// CHECK3-NEXT: ret void
+//
+//
+// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l43
+// CHECK3-SAME: (i32* nonnull align 4 dereferenceable(4) [[ARGC:%.*]], i32* nonnull align 4 dereferenceable(4) [[B:%.*]], i32* nonnull align 4 dereferenceable(4) [[C:%.*]], i32* [[D:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], %class.anon* nonnull align 8 dereferenceable(40) [[L:%.*]]) #[[ATTR1]] {
+// CHECK3-NEXT: entry:
+// CHECK3-NEXT: [[ARGC_ADDR:%.*]] = alloca i32*, align 8
+// CHECK3-NEXT: [[B_ADDR:%.*]] = alloca i32*, align 8
+// CHECK3-NEXT: [[C_ADDR:%.*]] = alloca i32*, align 8
+// CHECK3-NEXT: [[D_ADDR:%.*]] = alloca i32*, align 8
+// CHECK3-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 8
+// CHECK3-NEXT: [[L_ADDR:%.*]] = alloca %class.anon*, align 8
+// CHECK3-NEXT: [[TMP:%.*]] = alloca i32*, align 8
+// CHECK3-NEXT: [[_TMP1:%.*]] = alloca i32*, align 8
+// CHECK3-NEXT: [[_TMP2:%.*]] = alloca %class.anon*, align 8
+// CHECK3-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [6 x i8*], align 8
+// CHECK3-NEXT: store i32* [[ARGC]], i32** [[ARGC_ADDR]], align 8
+// CHECK3-NEXT: store i32* [[B]], i32** [[B_ADDR]], align 8
+// CHECK3-NEXT: store i32* [[C]], i32** [[C_ADDR]], align 8
+// CHECK3-NEXT: store i32* [[D]], i32** [[D_ADDR]], align 8
+// CHECK3-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 8
+// CHECK3-NEXT: store %class.anon* [[L]], %class.anon** [[L_ADDR]], align 8
+// CHECK3-NEXT: [[TMP0:%.*]] = load i32*, i32** [[ARGC_ADDR]], align 8
+// CHECK3-NEXT: [[TMP1:%.*]] = load i32*, i32** [[B_ADDR]], align 8
+// CHECK3-NEXT: [[TMP2:%.*]] = load i32*, i32** [[C_ADDR]], align 8
+// CHECK3-NEXT: [[TMP3:%.*]] = load i32*, i32** [[A_ADDR]], align 8
+// CHECK3-NEXT: [[TMP4:%.*]] = load %class.anon*, %class.anon** [[L_ADDR]], align 8
+// CHECK3-NEXT: store i32* [[TMP1]], i32** [[TMP]], align 8
+// CHECK3-NEXT: store i32* [[TMP2]], i32** [[_TMP1]], align 8
+// CHECK3-NEXT: store %class.anon* [[TMP4]], %class.anon** [[_TMP2]], align 8
+// CHECK3-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK3-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 1)
+// CHECK3-NEXT: call void @__kmpc_data_sharing_init_stack_spmd()
+// CHECK3-NEXT: br label [[DOTEXECUTE:%.*]]
+// CHECK3: .execute:
+// CHECK3-NEXT: [[TMP5:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2:[0-9]+]])
+// CHECK3-NEXT: [[TMP6:%.*]] = load i32*, i32** [[TMP]], align 8
+// CHECK3-NEXT: [[TMP7:%.*]] = load i32*, i32** [[_TMP1]], align 8
+// CHECK3-NEXT: [[TMP8:%.*]] = load i32*, i32** [[D_ADDR]], align 8
+// CHECK3-NEXT: [[TMP9:%.*]] = load %class.anon*, %class.anon** [[_TMP2]], align 8
+// CHECK3-NEXT: [[TMP10:%.*]] = getelementptr inbounds [6 x i8*], [6 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0
+// CHECK3-NEXT: [[TMP11:%.*]] = bitcast i32* [[TMP0]] to i8*
+// CHECK3-NEXT: store i8* [[TMP11]], i8** [[TMP10]], align 8
+// CHECK3-NEXT: [[TMP12:%.*]] = getelementptr inbounds [6 x i8*], [6 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1
+// CHECK3-NEXT: [[TMP13:%.*]] = bitcast i32* [[TMP6]] to i8*
+// CHECK3-NEXT: store i8* [[TMP13]], i8** [[TMP12]], align 8
+// CHECK3-NEXT: [[TMP14:%.*]] = getelementptr inbounds [6 x i8*], [6 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 2
+// CHECK3-NEXT: [[TMP15:%.*]] = bitcast i32* [[TMP7]] to i8*
+// CHECK3-NEXT: store i8* [[TMP15]], i8** [[TMP14]], align 8
+// CHECK3-NEXT: [[TMP16:%.*]] = getelementptr inbounds [6 x i8*], [6 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 3
+// CHECK3-NEXT: [[TMP17:%.*]] = bitcast i32* [[TMP8]] to i8*
+// CHECK3-NEXT: store i8* [[TMP17]], i8** [[TMP16]], align 8
+// CHECK3-NEXT: [[TMP18:%.*]] = getelementptr inbounds [6 x i8*], [6 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 4
+// CHECK3-NEXT: [[TMP19:%.*]] = bitcast i32* [[TMP3]] to i8*
+// CHECK3-NEXT: store i8* [[TMP19]], i8** [[TMP18]], align 8
+// CHECK3-NEXT: [[TMP20:%.*]] = getelementptr inbounds [6 x i8*], [6 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 5
+// CHECK3-NEXT: [[TMP21:%.*]] = bitcast %class.anon* [[TMP9]] to i8*
+// CHECK3-NEXT: store i8* [[TMP21]], i8** [[TMP20]], align 8
+// CHECK3-NEXT: [[TMP22:%.*]] = bitcast [6 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK3-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB2]], i32 [[TMP5]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32*, i32*, i32*, i32*, i32*, %class.anon*)* @__omp_outlined__ to i8*), i8* null, i8** [[TMP22]], i64 6)
+// CHECK3-NEXT: br label [[DOTOMP_DEINIT:%.*]]
+// CHECK3: .omp.deinit:
+// CHECK3-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 1)
+// CHECK3-NEXT: br label [[DOTEXIT:%.*]]
+// CHECK3: .exit:
+// CHECK3-NEXT: ret void
+//
+//
+// CHECK3-LABEL: define {{[^@]+}}@__omp_outlined__
+// CHECK3-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[ARGC:%.*]], i32* nonnull align 4 dereferenceable(4) [[B:%.*]], i32* nonnull align 4 dereferenceable(4) [[C:%.*]], i32* [[D:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], %class.anon* nonnull align 8 dereferenceable(40) [[L:%.*]]) #[[ATTR1]] {
+// CHECK3-NEXT: entry:
+// CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK3-NEXT: [[ARGC_ADDR:%.*]] = alloca i32*, align 8
+// CHECK3-NEXT: [[B_ADDR:%.*]] = alloca i32*, align 8
+// CHECK3-NEXT: [[C_ADDR:%.*]] = alloca i32*, align 8
+// CHECK3-NEXT: [[D_ADDR:%.*]] = alloca i32*, align 8
+// CHECK3-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 8
+// CHECK3-NEXT: [[L_ADDR:%.*]] = alloca %class.anon*, align 8
+// CHECK3-NEXT: [[TMP:%.*]] = alloca i32*, align 8
+// CHECK3-NEXT: [[_TMP1:%.*]] = alloca i32*, align 8
+// CHECK3-NEXT: [[_TMP2:%.*]] = alloca %class.anon*, align 8
+// CHECK3-NEXT: [[L3:%.*]] = alloca [[CLASS_ANON:%.*]], align 8
+// CHECK3-NEXT: [[_TMP4:%.*]] = alloca %class.anon*, align 8
+// CHECK3-NEXT: [[ARGC5:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[B6:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[_TMP7:%.*]] = alloca i32*, align 8
+// CHECK3-NEXT: [[C8:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[_TMP9:%.*]] = alloca i32*, align 8
+// CHECK3-NEXT: [[A10:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK3-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
+// CHECK3-NEXT: store i32* [[ARGC]], i32** [[ARGC_ADDR]], align 8
+// CHECK3-NEXT: store i32* [[B]], i32** [[B_ADDR]], align 8
+// CHECK3-NEXT: store i32* [[C]], i32** [[C_ADDR]], align 8
+// CHECK3-NEXT: store i32* [[D]], i32** [[D_ADDR]], align 8
+// CHECK3-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 8
+// CHECK3-NEXT: store %class.anon* [[L]], %class.anon** [[L_ADDR]], align 8
+// CHECK3-NEXT: [[TMP0:%.*]] = load i32*, i32** [[ARGC_ADDR]], align 8
+// CHECK3-NEXT: [[TMP1:%.*]] = load i32*, i32** [[B_ADDR]], align 8
+// CHECK3-NEXT: [[TMP2:%.*]] = load i32*, i32** [[C_ADDR]], align 8
+// CHECK3-NEXT: [[TMP3:%.*]] = load i32*, i32** [[A_ADDR]], align 8
+// CHECK3-NEXT: [[TMP4:%.*]] = load %class.anon*, %class.anon** [[L_ADDR]], align 8
+// CHECK3-NEXT: store i32* [[TMP1]], i32** [[TMP]], align 8
+// CHECK3-NEXT: store i32* [[TMP2]], i32** [[_TMP1]], align 8
+// CHECK3-NEXT: store %class.anon* [[TMP4]], %class.anon** [[_TMP2]], align 8
+// CHECK3-NEXT: [[TMP5:%.*]] = load %class.anon*, %class.anon** [[_TMP2]], align 8
+// CHECK3-NEXT: [[TMP6:%.*]] = bitcast %class.anon* [[L3]] to i8*
+// CHECK3-NEXT: [[TMP7:%.*]] = bitcast %class.anon* [[TMP5]] to i8*
+// CHECK3-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP6]], i8* align 8 [[TMP7]], i64 40, i1 false)
+// CHECK3-NEXT: store %class.anon* [[L3]], %class.anon** [[_TMP4]], align 8
+// CHECK3-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP0]], align 4
+// CHECK3-NEXT: store i32 [[TMP8]], i32* [[ARGC5]], align 4
+// CHECK3-NEXT: [[TMP9:%.*]] = load i32*, i32** [[TMP]], align 8
+// CHECK3-NEXT: [[TMP10:%.*]] = load i32, i32* [[TMP9]], align 4
+// CHECK3-NEXT: store i32 [[TMP10]], i32* [[B6]], align 4
+// CHECK3-NEXT: store i32* [[B6]], i32** [[_TMP7]], align 8
+// CHECK3-NEXT: [[TMP11:%.*]] = load i32*, i32** [[_TMP1]], align 8
+// CHECK3-NEXT: [[TMP12:%.*]] = load i32, i32* [[TMP11]], align 4
+// CHECK3-NEXT: store i32 [[TMP12]], i32* [[C8]], align 4
+// CHECK3-NEXT: store i32* [[C8]], i32** [[_TMP9]], align 8
+// CHECK3-NEXT: [[TMP13:%.*]] = load i32, i32* [[TMP3]], align 4
+// CHECK3-NEXT: store i32 [[TMP13]], i32* [[A10]], align 4
+// CHECK3-NEXT: [[TMP14:%.*]] = load %class.anon*, %class.anon** [[_TMP4]], align 8
+// CHECK3-NEXT: [[TMP15:%.*]] = getelementptr inbounds [[CLASS_ANON]], %class.anon* [[TMP14]], i32 0, i32 0
+// CHECK3-NEXT: store i32* [[ARGC5]], i32** [[TMP15]], align 8
+// CHECK3-NEXT: [[TMP16:%.*]] = getelementptr inbounds [[CLASS_ANON]], %class.anon* [[TMP14]], i32 0, i32 1
+// CHECK3-NEXT: [[TMP17:%.*]] = load i32*, i32** [[_TMP7]], align 8
+// CHECK3-NEXT: store i32* [[TMP17]], i32** [[TMP16]], align 8
+// CHECK3-NEXT: [[TMP18:%.*]] = getelementptr inbounds [[CLASS_ANON]], %class.anon* [[TMP14]], i32 0, i32 2
+// CHECK3-NEXT: [[TMP19:%.*]] = load i32*, i32** [[_TMP9]], align 8
+// CHECK3-NEXT: store i32* [[TMP19]], i32** [[TMP18]], align 8
+// CHECK3-NEXT: [[TMP20:%.*]] = getelementptr inbounds [[CLASS_ANON]], %class.anon* [[TMP14]], i32 0, i32 3
+// CHECK3-NEXT: store i32** [[D_ADDR]], i32*** [[TMP20]], align 8
+// CHECK3-NEXT: [[TMP21:%.*]] = getelementptr inbounds [[CLASS_ANON]], %class.anon* [[TMP14]], i32 0, i32 4
+// CHECK3-NEXT: store i32* [[A10]], i32** [[TMP21]], align 8
+// CHECK3-NEXT: [[TMP22:%.*]] = load %class.anon*, %class.anon** [[_TMP4]], align 8
+// CHECK3-NEXT: [[CALL:%.*]] = call i64 @"_ZZ4mainENK3$_0clEv"(%class.anon* nonnull dereferenceable(40) [[TMP22]]) #[[ATTR7]]
+// CHECK3-NEXT: ret void
+//
+//
+// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN1S3fooEv_l27_worker
+// CHECK3-SAME: () #[[ATTR0]] {
+// CHECK3-NEXT: entry:
+// CHECK3-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 8
+// CHECK3-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1
+// CHECK3-NEXT: store i8* null, i8** [[WORK_FN]], align 8
+// CHECK3-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1
+// CHECK3-NEXT: br label [[DOTAWAIT_WORK:%.*]]
+// CHECK3: .await.work:
+// CHECK3-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
+// CHECK3-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]])
+// CHECK3-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8
+// CHECK3-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1
+// CHECK3-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 8
+// CHECK3-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null
+// CHECK3-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]]
+// CHECK3: .select.workers:
+// CHECK3-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1
+// CHECK3-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0
+// CHECK3-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]]
+// CHECK3: .execute.parallel:
+// CHECK3-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK3-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)*
+// CHECK3-NEXT: call void [[TMP5]](i16 0, i32 [[TMP4]])
+// CHECK3-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]]
+// CHECK3: .terminate.parallel:
+// CHECK3-NEXT: call void @__kmpc_kernel_end_parallel()
+// CHECK3-NEXT: br label [[DOTBARRIER_PARALLEL]]
+// CHECK3: .barrier.parallel:
+// CHECK3-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
+// CHECK3-NEXT: br label [[DOTAWAIT_WORK]]
+// CHECK3: .exit:
+// CHECK3-NEXT: ret void
+//
+//
+// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN1S3fooEv_l27
+// CHECK3-SAME: (%struct.S* [[THIS:%.*]], %class.anon.0* nonnull align 8 dereferenceable(8) [[L:%.*]]) #[[ATTR1]] {
+// CHECK3-NEXT: entry:
+// CHECK3-NEXT: [[THIS_ADDR:%.*]] = alloca %struct.S*, align 8
+// CHECK3-NEXT: [[L_ADDR:%.*]] = alloca %class.anon.0*, align 8
+// CHECK3-NEXT: [[TMP:%.*]] = alloca %class.anon.0*, align 8
+// CHECK3-NEXT: [[L7:%.*]] = alloca [[CLASS_ANON_0:%.*]], align 8
+// CHECK3-NEXT: [[_TMP8:%.*]] = alloca %class.anon.0*, align 8
+// CHECK3-NEXT: store %struct.S* [[THIS]], %struct.S** [[THIS_ADDR]], align 8
+// CHECK3-NEXT: store %class.anon.0* [[L]], %class.anon.0** [[L_ADDR]], align 8
+// CHECK3-NEXT: [[TMP0:%.*]] = load %struct.S*, %struct.S** [[THIS_ADDR]], align 8
+// CHECK3-NEXT: [[TMP1:%.*]] = load %class.anon.0*, %class.anon.0** [[L_ADDR]], align 8
+// CHECK3-NEXT: store %class.anon.0* [[TMP1]], %class.anon.0** [[TMP]], align 8
+// CHECK3-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+// CHECK3-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK3-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+// CHECK3-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]]
+// CHECK3-NEXT: [[TMP2:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]]
+// CHECK3-NEXT: br i1 [[TMP2]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]]
+// CHECK3: .worker:
+// CHECK3-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN1S3fooEv_l27_worker() #[[ATTR6]]
+// CHECK3-NEXT: br label [[DOTEXIT:%.*]]
+// CHECK3: .mastercheck:
+// CHECK3-NEXT: [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+// CHECK3-NEXT: [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK3-NEXT: [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+// CHECK3-NEXT: [[TMP3:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1
+// CHECK3-NEXT: [[TMP4:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1
+// CHECK3-NEXT: [[TMP5:%.*]] = xor i32 [[TMP3]], -1
+// CHECK3-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP4]], [[TMP5]]
+// CHECK3-NEXT: [[TMP6:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]]
+// CHECK3-NEXT: br i1 [[TMP6]], label [[DOTMASTER:%.*]], label [[DOTEXIT]]
+// CHECK3: .master:
+// CHECK3-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK3-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+// CHECK3-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]]
+// CHECK3-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1)
+// CHECK3-NEXT: call void @__kmpc_data_sharing_init_stack()
+// CHECK3-NEXT: [[TMP7:%.*]] = load %class.anon.0*, %class.anon.0** [[TMP]], align 8
+// CHECK3-NEXT: [[TMP8:%.*]] = bitcast %class.anon.0* [[L7]] to i8*
+// CHECK3-NEXT: [[TMP9:%.*]] = bitcast %class.anon.0* [[TMP7]] to i8*
+// CHECK3-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP8]], i8* align 8 [[TMP9]], i64 8, i1 false)
+// CHECK3-NEXT: store %class.anon.0* [[L7]], %class.anon.0** [[_TMP8]], align 8
+// CHECK3-NEXT: [[TMP10:%.*]] = load %class.anon.0*, %class.anon.0** [[_TMP8]], align 8
+// CHECK3-NEXT: [[TMP11:%.*]] = getelementptr inbounds [[CLASS_ANON_0]], %class.anon.0* [[TMP10]], i32 0, i32 0
+// CHECK3-NEXT: store %struct.S* [[TMP0]], %struct.S** [[TMP11]], align 8
+// CHECK3-NEXT: [[TMP12:%.*]] = load %class.anon.0*, %class.anon.0** [[_TMP8]], align 8
+// CHECK3-NEXT: [[CALL:%.*]] = call i32 @_ZZN1S3fooEvENKUlvE_clEv(%class.anon.0* nonnull dereferenceable(8) [[TMP12]]) #[[ATTR7]]
+// CHECK3-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]]
+// CHECK3: .termination.notifier:
+// CHECK3-NEXT: call void @__kmpc_kernel_deinit(i16 1)
+// CHECK3-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
+// CHECK3-NEXT: br label [[DOTEXIT]]
+// CHECK3: .exit:
+// CHECK3-NEXT: ret void
+//
+//
+// CHECK3-LABEL: define {{[^@]+}}@_ZZN1S3fooEvENKUlvE_clEv
+// CHECK3-SAME: (%class.anon.0* nonnull dereferenceable(8) [[THIS:%.*]]) #[[ATTR4:[0-9]+]] comdat align 2 {
+// CHECK3-NEXT: entry:
+// CHECK3-NEXT: [[THIS_ADDR:%.*]] = alloca %class.anon.0*, align 8
+// CHECK3-NEXT: store %class.anon.0* [[THIS]], %class.anon.0** [[THIS_ADDR]], align 8
+// CHECK3-NEXT: [[THIS1:%.*]] = load %class.anon.0*, %class.anon.0** [[THIS_ADDR]], align 8
+// CHECK3-NEXT: [[TMP0:%.*]] = getelementptr inbounds [[CLASS_ANON_0:%.*]], %class.anon.0* [[THIS1]], i32 0, i32 0
+// CHECK3-NEXT: [[TMP1:%.*]] = load %struct.S*, %struct.S** [[TMP0]], align 8
+// CHECK3-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT_S:%.*]], %struct.S* [[TMP1]], i32 0, i32 0
+// CHECK3-NEXT: [[TMP2:%.*]] = load i32, i32* [[A]], align 4
+// CHECK3-NEXT: ret i32 [[TMP2]]
+//
+//
+// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN1S3fooEv_l29
+// CHECK3-SAME: (%struct.S* [[THIS:%.*]], %class.anon.0* nonnull align 8 dereferenceable(8) [[L:%.*]]) #[[ATTR1]] {
+// CHECK3-NEXT: entry:
+// CHECK3-NEXT: [[THIS_ADDR:%.*]] = alloca %struct.S*, align 8
+// CHECK3-NEXT: [[L_ADDR:%.*]] = alloca %class.anon.0*, align 8
+// CHECK3-NEXT: [[TMP:%.*]] = alloca %class.anon.0*, align 8
+// CHECK3-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [2 x i8*], align 8
+// CHECK3-NEXT: store %struct.S* [[THIS]], %struct.S** [[THIS_ADDR]], align 8
+// CHECK3-NEXT: store %class.anon.0* [[L]], %class.anon.0** [[L_ADDR]], align 8
+// CHECK3-NEXT: [[TMP0:%.*]] = load %struct.S*, %struct.S** [[THIS_ADDR]], align 8
+// CHECK3-NEXT: [[TMP1:%.*]] = load %class.anon.0*, %class.anon.0** [[L_ADDR]], align 8
+// CHECK3-NEXT: store %class.anon.0* [[TMP1]], %class.anon.0** [[TMP]], align 8
+// CHECK3-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK3-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 1)
+// CHECK3-NEXT: call void @__kmpc_data_sharing_init_stack_spmd()
+// CHECK3-NEXT: br label [[DOTEXECUTE:%.*]]
+// CHECK3: .execute:
+// CHECK3-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]])
+// CHECK3-NEXT: [[TMP3:%.*]] = load %class.anon.0*, %class.anon.0** [[TMP]], align 8
+// CHECK3-NEXT: [[TMP4:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0
+// CHECK3-NEXT: [[TMP5:%.*]] = bitcast %struct.S* [[TMP0]] to i8*
+// CHECK3-NEXT: store i8* [[TMP5]], i8** [[TMP4]], align 8
+// CHECK3-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1
+// CHECK3-NEXT: [[TMP7:%.*]] = bitcast %class.anon.0* [[TMP3]] to i8*
+// CHECK3-NEXT: store i8* [[TMP7]], i8** [[TMP6]], align 8
+// CHECK3-NEXT: [[TMP8:%.*]] = bitcast [2 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK3-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, %struct.S*, %class.anon.0*)* @__omp_outlined__1 to i8*), i8* null, i8** [[TMP8]], i64 2)
+// CHECK3-NEXT: br label [[DOTOMP_DEINIT:%.*]]
+// CHECK3: .omp.deinit:
+// CHECK3-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 1)
+// CHECK3-NEXT: br label [[DOTEXIT:%.*]]
+// CHECK3: .exit:
+// CHECK3-NEXT: ret void
+//
+//
+// CHECK3-LABEL: define {{[^@]+}}@__omp_outlined__1
+// CHECK3-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], %struct.S* [[THIS:%.*]], %class.anon.0* nonnull align 8 dereferenceable(8) [[L:%.*]]) #[[ATTR1]] {
+// CHECK3-NEXT: entry:
+// CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK3-NEXT: [[THIS_ADDR:%.*]] = alloca %struct.S*, align 8
+// CHECK3-NEXT: [[L_ADDR:%.*]] = alloca %class.anon.0*, align 8
+// CHECK3-NEXT: [[TMP:%.*]] = alloca %class.anon.0*, align 8
+// CHECK3-NEXT: [[L1:%.*]] = alloca [[CLASS_ANON_0:%.*]], align 8
+// CHECK3-NEXT: [[_TMP2:%.*]] = alloca %class.anon.0*, align 8
+// CHECK3-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK3-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
+// CHECK3-NEXT: store %struct.S* [[THIS]], %struct.S** [[THIS_ADDR]], align 8
+// CHECK3-NEXT: store %class.anon.0* [[L]], %class.anon.0** [[L_ADDR]], align 8
+// CHECK3-NEXT: [[TMP0:%.*]] = load %struct.S*, %struct.S** [[THIS_ADDR]], align 8
+// CHECK3-NEXT: [[TMP1:%.*]] = load %class.anon.0*, %class.anon.0** [[L_ADDR]], align 8
+// CHECK3-NEXT: store %class.anon.0* [[TMP1]], %class.anon.0** [[TMP]], align 8
+// CHECK3-NEXT: [[TMP2:%.*]] = load %class.anon.0*, %class.anon.0** [[TMP]], align 8
+// CHECK3-NEXT: [[TMP3:%.*]] = bitcast %class.anon.0* [[L1]] to i8*
+// CHECK3-NEXT: [[TMP4:%.*]] = bitcast %class.anon.0* [[TMP2]] to i8*
+// CHECK3-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP3]], i8* align 8 [[TMP4]], i64 8, i1 false)
+// CHECK3-NEXT: store %class.anon.0* [[L1]], %class.anon.0** [[_TMP2]], align 8
+// CHECK3-NEXT: [[TMP5:%.*]] = load %class.anon.0*, %class.anon.0** [[_TMP2]], align 8
+// CHECK3-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[CLASS_ANON_0]], %class.anon.0* [[TMP5]], i32 0, i32 0
+// CHECK3-NEXT: store %struct.S* [[TMP0]], %struct.S** [[TMP6]], align 8
+// CHECK3-NEXT: [[TMP7:%.*]] = load %class.anon.0*, %class.anon.0** [[_TMP2]], align 8
+// CHECK3-NEXT: [[CALL:%.*]] = call i32 @_ZZN1S3fooEvENKUlvE_clEv(%class.anon.0* nonnull dereferenceable(8) [[TMP7]]) #[[ATTR7]]
+// CHECK3-NEXT: ret void
+//
+//
+// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooIZN1S3fooEvEUlvE_EiRKT__l18
+// CHECK3-SAME: (%class.anon.0* nonnull align 8 dereferenceable(8) [[T:%.*]]) #[[ATTR1]] {
+// CHECK3-NEXT: entry:
+// CHECK3-NEXT: [[T_ADDR:%.*]] = alloca %class.anon.0*, align 8
+// CHECK3-NEXT: [[TMP:%.*]] = alloca %class.anon.0*, align 8
+// CHECK3-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [1 x i8*], align 8
+// CHECK3-NEXT: store %class.anon.0* [[T]], %class.anon.0** [[T_ADDR]], align 8
+// CHECK3-NEXT: [[TMP0:%.*]] = load %class.anon.0*, %class.anon.0** [[T_ADDR]], align 8
+// CHECK3-NEXT: store %class.anon.0* [[TMP0]], %class.anon.0** [[TMP]], align 8
+// CHECK3-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK3-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 1)
+// CHECK3-NEXT: call void @__kmpc_data_sharing_init_stack_spmd()
+// CHECK3-NEXT: br label [[DOTEXECUTE:%.*]]
+// CHECK3: .execute:
+// CHECK3-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]])
+// CHECK3-NEXT: [[TMP2:%.*]] = load %class.anon.0*, %class.anon.0** [[TMP]], align 8
+// CHECK3-NEXT: [[TMP3:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0
+// CHECK3-NEXT: [[TMP4:%.*]] = bitcast %class.anon.0* [[TMP2]] to i8*
+// CHECK3-NEXT: store i8* [[TMP4]], i8** [[TMP3]], align 8
+// CHECK3-NEXT: [[TMP5:%.*]] = bitcast [1 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK3-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB2]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, %class.anon.0*)* @__omp_outlined__2 to i8*), i8* null, i8** [[TMP5]], i64 1)
+// CHECK3-NEXT: br label [[DOTOMP_DEINIT:%.*]]
+// CHECK3: .omp.deinit:
+// CHECK3-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 1)
+// CHECK3-NEXT: br label [[DOTEXIT:%.*]]
+// CHECK3: .exit:
+// CHECK3-NEXT: ret void
+//
+//
+// CHECK3-LABEL: define {{[^@]+}}@__omp_outlined__2
+// CHECK3-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], %class.anon.0* nonnull align 8 dereferenceable(8) [[T:%.*]]) #[[ATTR1]] {
+// CHECK3-NEXT: entry:
+// CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK3-NEXT: [[T_ADDR:%.*]] = alloca %class.anon.0*, align 8
+// CHECK3-NEXT: [[TMP:%.*]] = alloca %class.anon.0*, align 8
+// CHECK3-NEXT: [[T1:%.*]] = alloca [[CLASS_ANON_0:%.*]], align 8
+// CHECK3-NEXT: [[_TMP2:%.*]] = alloca %class.anon.0*, align 8
+// CHECK3-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK3-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
+// CHECK3-NEXT: store %class.anon.0* [[T]], %class.anon.0** [[T_ADDR]], align 8
+// CHECK3-NEXT: [[TMP0:%.*]] = load %class.anon.0*, %class.anon.0** [[T_ADDR]], align 8
+// CHECK3-NEXT: store %class.anon.0* [[TMP0]], %class.anon.0** [[TMP]], align 8
+// CHECK3-NEXT: [[TMP1:%.*]] = load %class.anon.0*, %class.anon.0** [[TMP]], align 8
+// CHECK3-NEXT: [[TMP2:%.*]] = bitcast %class.anon.0* [[T1]] to i8*
+// CHECK3-NEXT: [[TMP3:%.*]] = bitcast %class.anon.0* [[TMP1]] to i8*
+// CHECK3-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP2]], i8* align 8 [[TMP3]], i64 8, i1 false)
+// CHECK3-NEXT: store %class.anon.0* [[T1]], %class.anon.0** [[_TMP2]], align 8
+// CHECK3-NEXT: [[TMP4:%.*]] = load %class.anon.0*, %class.anon.0** [[_TMP2]], align 8
+// CHECK3-NEXT: [[TMP5:%.*]] = load %class.anon.0*, %class.anon.0** [[_TMP2]], align 8
+// CHECK3-NEXT: [[CALL:%.*]] = call i32 @_ZZN1S3fooEvENKUlvE_clEv(%class.anon.0* nonnull dereferenceable(8) [[TMP5]]) #[[ATTR7]]
+// CHECK3-NEXT: ret void
+//
+//
+// CHECK4-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l41_worker
+// CHECK4-SAME: () #[[ATTR0:[0-9]+]] {
+// CHECK4-NEXT: entry:
+// CHECK4-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 8
+// CHECK4-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1
+// CHECK4-NEXT: store i8* null, i8** [[WORK_FN]], align 8
+// CHECK4-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1
+// CHECK4-NEXT: br label [[DOTAWAIT_WORK:%.*]]
+// CHECK4: .await.work:
+// CHECK4-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
+// CHECK4-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]])
+// CHECK4-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8
+// CHECK4-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1
+// CHECK4-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 8
+// CHECK4-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null
+// CHECK4-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]]
+// CHECK4: .select.workers:
+// CHECK4-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1
+// CHECK4-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0
+// CHECK4-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]]
+// CHECK4: .execute.parallel:
+// CHECK4-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]])
+// CHECK4-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)*
+// CHECK4-NEXT: call void [[TMP5]](i16 0, i32 [[TMP4]])
+// CHECK4-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]]
+// CHECK4: .terminate.parallel:
+// CHECK4-NEXT: call void @__kmpc_kernel_end_parallel()
label [[DOTBARRIER_PARALLEL]] +// CHECK4: .barrier.parallel: +// CHECK4-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK4-NEXT: br label [[DOTAWAIT_WORK]] +// CHECK4: .exit: +// CHECK4-NEXT: ret void +// +// +// CHECK4-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l41 +// CHECK4-SAME: (i64 [[ARGC:%.*]], i32* nonnull align 4 dereferenceable(4) [[B:%.*]], i32* nonnull align 4 dereferenceable(4) [[C:%.*]], i32* [[D:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], %class.anon* nonnull align 8 dereferenceable(40) [[L:%.*]]) #[[ATTR1:[0-9]+]] { +// CHECK4-NEXT: entry: +// CHECK4-NEXT: [[ARGC_ADDR:%.*]] = alloca i64, align 8 +// CHECK4-NEXT: [[B_ADDR:%.*]] = alloca i32*, align 8 +// CHECK4-NEXT: [[C_ADDR:%.*]] = alloca i32*, align 8 +// CHECK4-NEXT: [[D_ADDR:%.*]] = alloca i32*, align 8 +// CHECK4-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 8 +// CHECK4-NEXT: [[L_ADDR:%.*]] = alloca %class.anon*, align 8 +// CHECK4-NEXT: [[TMP:%.*]] = alloca i32*, align 8 +// CHECK4-NEXT: [[_TMP1:%.*]] = alloca i32*, align 8 +// CHECK4-NEXT: [[_TMP2:%.*]] = alloca %class.anon*, align 8 +// CHECK4-NEXT: [[L9:%.*]] = alloca [[CLASS_ANON:%.*]], align 8 +// CHECK4-NEXT: [[_TMP10:%.*]] = alloca %class.anon*, align 8 +// CHECK4-NEXT: [[B11:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[_TMP12:%.*]] = alloca i32*, align 8 +// CHECK4-NEXT: [[C13:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[_TMP14:%.*]] = alloca i32*, align 8 +// CHECK4-NEXT: store i64 [[ARGC]], i64* [[ARGC_ADDR]], align 8 +// CHECK4-NEXT: store i32* [[B]], i32** [[B_ADDR]], align 8 +// CHECK4-NEXT: store i32* [[C]], i32** [[C_ADDR]], align 8 +// CHECK4-NEXT: store i32* [[D]], i32** [[D_ADDR]], align 8 +// CHECK4-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 8 +// CHECK4-NEXT: store %class.anon* [[L]], %class.anon** [[L_ADDR]], align 8 +// CHECK4-NEXT: [[CONV:%.*]] = bitcast i64* [[ARGC_ADDR]] to i32* +// CHECK4-NEXT: [[TMP0:%.*]] = load i32*, i32** [[B_ADDR]], align 8 +// CHECK4-NEXT: [[TMP1:%.*]] = load i32*, i32** [[C_ADDR]], align 8 +// CHECK4-NEXT: [[TMP2:%.*]] = load i32*, i32** [[A_ADDR]], align 8 +// CHECK4-NEXT: [[TMP3:%.*]] = load %class.anon*, %class.anon** [[L_ADDR]], align 8 +// CHECK4-NEXT: store i32* [[TMP0]], i32** [[TMP]], align 8 +// CHECK4-NEXT: store i32* [[TMP1]], i32** [[_TMP1]], align 8 +// CHECK4-NEXT: store %class.anon* [[TMP3]], %class.anon** [[_TMP2]], align 8 +// CHECK4-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK4-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK4-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK4-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]] +// CHECK4-NEXT: [[TMP4:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]] +// CHECK4-NEXT: br i1 [[TMP4]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]] +// CHECK4: .worker: +// CHECK4-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l41_worker() #[[ATTR6:[0-9]+]] +// CHECK4-NEXT: br label [[DOTEXIT:%.*]] +// CHECK4: .mastercheck: +// CHECK4-NEXT: [[NVPTX_TID3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK4-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK4-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK4-NEXT: [[TMP5:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE5]], 1 +// CHECK4-NEXT: [[TMP6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], 1 +// CHECK4-NEXT: 
[[TMP7:%.*]] = xor i32 [[TMP5]], -1 +// CHECK4-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP6]], [[TMP7]] +// CHECK4-NEXT: [[TMP8:%.*]] = icmp eq i32 [[NVPTX_TID3]], [[MASTER_TID]] +// CHECK4-NEXT: br i1 [[TMP8]], label [[DOTMASTER:%.*]], label [[DOTEXIT]] +// CHECK4: .master: +// CHECK4-NEXT: [[NVPTX_NUM_THREADS6:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK4-NEXT: [[NVPTX_WARP_SIZE7:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK4-NEXT: [[THREAD_LIMIT8:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS6]], [[NVPTX_WARP_SIZE7]] +// CHECK4-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT8]], i16 1) +// CHECK4-NEXT: call void @__kmpc_data_sharing_init_stack() +// CHECK4-NEXT: [[TMP9:%.*]] = load %class.anon*, %class.anon** [[_TMP2]], align 8 +// CHECK4-NEXT: [[TMP10:%.*]] = bitcast %class.anon* [[L9]] to i8* +// CHECK4-NEXT: [[TMP11:%.*]] = bitcast %class.anon* [[TMP9]] to i8* +// CHECK4-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP10]], i8* align 8 [[TMP11]], i64 40, i1 false) +// CHECK4-NEXT: store %class.anon* [[L9]], %class.anon** [[_TMP10]], align 8 +// CHECK4-NEXT: [[TMP12:%.*]] = load i32*, i32** [[TMP]], align 8 +// CHECK4-NEXT: [[TMP13:%.*]] = load i32, i32* [[TMP12]], align 4 +// CHECK4-NEXT: store i32 [[TMP13]], i32* [[B11]], align 4 +// CHECK4-NEXT: store i32* [[B11]], i32** [[_TMP12]], align 8 +// CHECK4-NEXT: [[TMP14:%.*]] = load i32*, i32** [[_TMP1]], align 8 +// CHECK4-NEXT: [[TMP15:%.*]] = load i32, i32* [[TMP14]], align 4 +// CHECK4-NEXT: store i32 [[TMP15]], i32* [[C13]], align 4 +// CHECK4-NEXT: store i32* [[C13]], i32** [[_TMP14]], align 8 +// CHECK4-NEXT: [[TMP16:%.*]] = load %class.anon*, %class.anon** [[_TMP10]], align 8 +// CHECK4-NEXT: [[TMP17:%.*]] = getelementptr inbounds [[CLASS_ANON]], %class.anon* [[TMP16]], i32 0, i32 0 +// CHECK4-NEXT: store i32* [[CONV]], i32** [[TMP17]], align 8 +// CHECK4-NEXT: [[TMP18:%.*]] = getelementptr inbounds [[CLASS_ANON]], %class.anon* [[TMP16]], i32 0, i32 1 +// CHECK4-NEXT: [[TMP19:%.*]] = load i32*, i32** [[_TMP12]], align 8 +// CHECK4-NEXT: store i32* [[TMP19]], i32** [[TMP18]], align 8 +// CHECK4-NEXT: [[TMP20:%.*]] = getelementptr inbounds [[CLASS_ANON]], %class.anon* [[TMP16]], i32 0, i32 2 +// CHECK4-NEXT: [[TMP21:%.*]] = load i32*, i32** [[_TMP14]], align 8 +// CHECK4-NEXT: store i32* [[TMP21]], i32** [[TMP20]], align 8 +// CHECK4-NEXT: [[TMP22:%.*]] = getelementptr inbounds [[CLASS_ANON]], %class.anon* [[TMP16]], i32 0, i32 3 +// CHECK4-NEXT: store i32** [[D_ADDR]], i32*** [[TMP22]], align 8 +// CHECK4-NEXT: [[TMP23:%.*]] = getelementptr inbounds [[CLASS_ANON]], %class.anon* [[TMP16]], i32 0, i32 4 +// CHECK4-NEXT: store i32* [[TMP2]], i32** [[TMP23]], align 8 +// CHECK4-NEXT: [[TMP24:%.*]] = load %class.anon*, %class.anon** [[_TMP10]], align 8 +// CHECK4-NEXT: [[CALL:%.*]] = call i64 @"_ZZ4mainENK3$_0clEv"(%class.anon* nonnull dereferenceable(40) [[TMP24]]) #[[ATTR7:[0-9]+]] +// CHECK4-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]] +// CHECK4: .termination.notifier: +// CHECK4-NEXT: call void @__kmpc_kernel_deinit(i16 1) +// CHECK4-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK4-NEXT: br label [[DOTEXIT]] +// CHECK4: .exit: +// CHECK4-NEXT: ret void +// +// +// CHECK4-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l43 +// CHECK4-SAME: (i32* nonnull align 4 dereferenceable(4) [[ARGC:%.*]], i32* nonnull align 4 dereferenceable(4) [[B:%.*]], i32* nonnull align 4 dereferenceable(4) [[C:%.*]], i32* [[D:%.*]], i32* nonnull align 4 
dereferenceable(4) [[A:%.*]], %class.anon* nonnull align 8 dereferenceable(40) [[L:%.*]]) #[[ATTR1]] { +// CHECK4-NEXT: entry: +// CHECK4-NEXT: [[ARGC_ADDR:%.*]] = alloca i32*, align 8 +// CHECK4-NEXT: [[B_ADDR:%.*]] = alloca i32*, align 8 +// CHECK4-NEXT: [[C_ADDR:%.*]] = alloca i32*, align 8 +// CHECK4-NEXT: [[D_ADDR:%.*]] = alloca i32*, align 8 +// CHECK4-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 8 +// CHECK4-NEXT: [[L_ADDR:%.*]] = alloca %class.anon*, align 8 +// CHECK4-NEXT: [[TMP:%.*]] = alloca i32*, align 8 +// CHECK4-NEXT: [[_TMP1:%.*]] = alloca i32*, align 8 +// CHECK4-NEXT: [[_TMP2:%.*]] = alloca %class.anon*, align 8 +// CHECK4-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [6 x i8*], align 8 +// CHECK4-NEXT: store i32* [[ARGC]], i32** [[ARGC_ADDR]], align 8 +// CHECK4-NEXT: store i32* [[B]], i32** [[B_ADDR]], align 8 +// CHECK4-NEXT: store i32* [[C]], i32** [[C_ADDR]], align 8 +// CHECK4-NEXT: store i32* [[D]], i32** [[D_ADDR]], align 8 +// CHECK4-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 8 +// CHECK4-NEXT: store %class.anon* [[L]], %class.anon** [[L_ADDR]], align 8 +// CHECK4-NEXT: [[TMP0:%.*]] = load i32*, i32** [[ARGC_ADDR]], align 8 +// CHECK4-NEXT: [[TMP1:%.*]] = load i32*, i32** [[B_ADDR]], align 8 +// CHECK4-NEXT: [[TMP2:%.*]] = load i32*, i32** [[C_ADDR]], align 8 +// CHECK4-NEXT: [[TMP3:%.*]] = load i32*, i32** [[A_ADDR]], align 8 +// CHECK4-NEXT: [[TMP4:%.*]] = load %class.anon*, %class.anon** [[L_ADDR]], align 8 +// CHECK4-NEXT: store i32* [[TMP1]], i32** [[TMP]], align 8 +// CHECK4-NEXT: store i32* [[TMP2]], i32** [[_TMP1]], align 8 +// CHECK4-NEXT: store %class.anon* [[TMP4]], %class.anon** [[_TMP2]], align 8 +// CHECK4-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK4-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 1) +// CHECK4-NEXT: call void @__kmpc_data_sharing_init_stack_spmd() +// CHECK4-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK4: .execute: +// CHECK4-NEXT: [[TMP5:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2:[0-9]+]]) +// CHECK4-NEXT: [[TMP6:%.*]] = load i32*, i32** [[TMP]], align 8 +// CHECK4-NEXT: [[TMP7:%.*]] = load i32*, i32** [[_TMP1]], align 8 +// CHECK4-NEXT: [[TMP8:%.*]] = load i32*, i32** [[D_ADDR]], align 8 +// CHECK4-NEXT: [[TMP9:%.*]] = load %class.anon*, %class.anon** [[_TMP2]], align 8 +// CHECK4-NEXT: [[TMP10:%.*]] = getelementptr inbounds [6 x i8*], [6 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0 +// CHECK4-NEXT: [[TMP11:%.*]] = bitcast i32* [[TMP0]] to i8* +// CHECK4-NEXT: store i8* [[TMP11]], i8** [[TMP10]], align 8 +// CHECK4-NEXT: [[TMP12:%.*]] = getelementptr inbounds [6 x i8*], [6 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1 +// CHECK4-NEXT: [[TMP13:%.*]] = bitcast i32* [[TMP6]] to i8* +// CHECK4-NEXT: store i8* [[TMP13]], i8** [[TMP12]], align 8 +// CHECK4-NEXT: [[TMP14:%.*]] = getelementptr inbounds [6 x i8*], [6 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 2 +// CHECK4-NEXT: [[TMP15:%.*]] = bitcast i32* [[TMP7]] to i8* +// CHECK4-NEXT: store i8* [[TMP15]], i8** [[TMP14]], align 8 +// CHECK4-NEXT: [[TMP16:%.*]] = getelementptr inbounds [6 x i8*], [6 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 3 +// CHECK4-NEXT: [[TMP17:%.*]] = bitcast i32* [[TMP8]] to i8* +// CHECK4-NEXT: store i8* [[TMP17]], i8** [[TMP16]], align 8 +// CHECK4-NEXT: [[TMP18:%.*]] = getelementptr inbounds [6 x i8*], [6 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 4 +// CHECK4-NEXT: [[TMP19:%.*]] = bitcast i32* [[TMP3]] to i8* +// CHECK4-NEXT: store i8* [[TMP19]], i8** [[TMP18]], 
align 8 +// CHECK4-NEXT: [[TMP20:%.*]] = getelementptr inbounds [6 x i8*], [6 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 5 +// CHECK4-NEXT: [[TMP21:%.*]] = bitcast %class.anon* [[TMP9]] to i8* +// CHECK4-NEXT: store i8* [[TMP21]], i8** [[TMP20]], align 8 +// CHECK4-NEXT: [[TMP22:%.*]] = bitcast [6 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK4-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB2]], i32 [[TMP5]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32*, i32*, i32*, i32*, i32*, %class.anon*)* @__omp_outlined__ to i8*), i8* null, i8** [[TMP22]], i64 6) +// CHECK4-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK4: .omp.deinit: +// CHECK4-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 1) +// CHECK4-NEXT: br label [[DOTEXIT:%.*]] +// CHECK4: .exit: +// CHECK4-NEXT: ret void +// +// +// CHECK4-LABEL: define {{[^@]+}}@__omp_outlined__ +// CHECK4-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[ARGC:%.*]], i32* nonnull align 4 dereferenceable(4) [[B:%.*]], i32* nonnull align 4 dereferenceable(4) [[C:%.*]], i32* [[D:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], %class.anon* nonnull align 8 dereferenceable(40) [[L:%.*]]) #[[ATTR1]] { +// CHECK4-NEXT: entry: +// CHECK4-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK4-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK4-NEXT: [[ARGC_ADDR:%.*]] = alloca i32*, align 8 +// CHECK4-NEXT: [[B_ADDR:%.*]] = alloca i32*, align 8 +// CHECK4-NEXT: [[C_ADDR:%.*]] = alloca i32*, align 8 +// CHECK4-NEXT: [[D_ADDR:%.*]] = alloca i32*, align 8 +// CHECK4-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 8 +// CHECK4-NEXT: [[L_ADDR:%.*]] = alloca %class.anon*, align 8 +// CHECK4-NEXT: [[TMP:%.*]] = alloca i32*, align 8 +// CHECK4-NEXT: [[_TMP1:%.*]] = alloca i32*, align 8 +// CHECK4-NEXT: [[_TMP2:%.*]] = alloca %class.anon*, align 8 +// CHECK4-NEXT: [[L3:%.*]] = alloca [[CLASS_ANON:%.*]], align 8 +// CHECK4-NEXT: [[_TMP4:%.*]] = alloca %class.anon*, align 8 +// CHECK4-NEXT: [[ARGC5:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[B6:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[_TMP7:%.*]] = alloca i32*, align 8 +// CHECK4-NEXT: [[C8:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[_TMP9:%.*]] = alloca i32*, align 8 +// CHECK4-NEXT: [[A10:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK4-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +// CHECK4-NEXT: store i32* [[ARGC]], i32** [[ARGC_ADDR]], align 8 +// CHECK4-NEXT: store i32* [[B]], i32** [[B_ADDR]], align 8 +// CHECK4-NEXT: store i32* [[C]], i32** [[C_ADDR]], align 8 +// CHECK4-NEXT: store i32* [[D]], i32** [[D_ADDR]], align 8 +// CHECK4-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 8 +// CHECK4-NEXT: store %class.anon* [[L]], %class.anon** [[L_ADDR]], align 8 +// CHECK4-NEXT: [[TMP0:%.*]] = load i32*, i32** [[ARGC_ADDR]], align 8 +// CHECK4-NEXT: [[TMP1:%.*]] = load i32*, i32** [[B_ADDR]], align 8 +// CHECK4-NEXT: [[TMP2:%.*]] = load i32*, i32** [[C_ADDR]], align 8 +// CHECK4-NEXT: [[TMP3:%.*]] = load i32*, i32** [[A_ADDR]], align 8 +// CHECK4-NEXT: [[TMP4:%.*]] = load %class.anon*, %class.anon** [[L_ADDR]], align 8 +// CHECK4-NEXT: store i32* [[TMP1]], i32** [[TMP]], align 8 +// CHECK4-NEXT: store i32* [[TMP2]], i32** [[_TMP1]], align 8 +// CHECK4-NEXT: store %class.anon* [[TMP4]], %class.anon** [[_TMP2]], align 8 +// CHECK4-NEXT: [[TMP5:%.*]] = load %class.anon*, 
%class.anon** [[_TMP2]], align 8 +// CHECK4-NEXT: [[TMP6:%.*]] = bitcast %class.anon* [[L3]] to i8* +// CHECK4-NEXT: [[TMP7:%.*]] = bitcast %class.anon* [[TMP5]] to i8* +// CHECK4-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP6]], i8* align 8 [[TMP7]], i64 40, i1 false) +// CHECK4-NEXT: store %class.anon* [[L3]], %class.anon** [[_TMP4]], align 8 +// CHECK4-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP0]], align 4 +// CHECK4-NEXT: store i32 [[TMP8]], i32* [[ARGC5]], align 4 +// CHECK4-NEXT: [[TMP9:%.*]] = load i32*, i32** [[TMP]], align 8 +// CHECK4-NEXT: [[TMP10:%.*]] = load i32, i32* [[TMP9]], align 4 +// CHECK4-NEXT: store i32 [[TMP10]], i32* [[B6]], align 4 +// CHECK4-NEXT: store i32* [[B6]], i32** [[_TMP7]], align 8 +// CHECK4-NEXT: [[TMP11:%.*]] = load i32*, i32** [[_TMP1]], align 8 +// CHECK4-NEXT: [[TMP12:%.*]] = load i32, i32* [[TMP11]], align 4 +// CHECK4-NEXT: store i32 [[TMP12]], i32* [[C8]], align 4 +// CHECK4-NEXT: store i32* [[C8]], i32** [[_TMP9]], align 8 +// CHECK4-NEXT: [[TMP13:%.*]] = load i32, i32* [[TMP3]], align 4 +// CHECK4-NEXT: store i32 [[TMP13]], i32* [[A10]], align 4 +// CHECK4-NEXT: [[TMP14:%.*]] = load %class.anon*, %class.anon** [[_TMP4]], align 8 +// CHECK4-NEXT: [[TMP15:%.*]] = getelementptr inbounds [[CLASS_ANON]], %class.anon* [[TMP14]], i32 0, i32 0 +// CHECK4-NEXT: store i32* [[ARGC5]], i32** [[TMP15]], align 8 +// CHECK4-NEXT: [[TMP16:%.*]] = getelementptr inbounds [[CLASS_ANON]], %class.anon* [[TMP14]], i32 0, i32 1 +// CHECK4-NEXT: [[TMP17:%.*]] = load i32*, i32** [[_TMP7]], align 8 +// CHECK4-NEXT: store i32* [[TMP17]], i32** [[TMP16]], align 8 +// CHECK4-NEXT: [[TMP18:%.*]] = getelementptr inbounds [[CLASS_ANON]], %class.anon* [[TMP14]], i32 0, i32 2 +// CHECK4-NEXT: [[TMP19:%.*]] = load i32*, i32** [[_TMP9]], align 8 +// CHECK4-NEXT: store i32* [[TMP19]], i32** [[TMP18]], align 8 +// CHECK4-NEXT: [[TMP20:%.*]] = getelementptr inbounds [[CLASS_ANON]], %class.anon* [[TMP14]], i32 0, i32 3 +// CHECK4-NEXT: store i32** [[D_ADDR]], i32*** [[TMP20]], align 8 +// CHECK4-NEXT: [[TMP21:%.*]] = getelementptr inbounds [[CLASS_ANON]], %class.anon* [[TMP14]], i32 0, i32 4 +// CHECK4-NEXT: store i32* [[A10]], i32** [[TMP21]], align 8 +// CHECK4-NEXT: [[TMP22:%.*]] = load %class.anon*, %class.anon** [[_TMP4]], align 8 +// CHECK4-NEXT: [[CALL:%.*]] = call i64 @"_ZZ4mainENK3$_0clEv"(%class.anon* nonnull dereferenceable(40) [[TMP22]]) #[[ATTR7]] +// CHECK4-NEXT: ret void +// +// +// CHECK4-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN1S3fooEv_l27_worker +// CHECK4-SAME: () #[[ATTR0]] { +// CHECK4-NEXT: entry: +// CHECK4-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 8 +// CHECK4-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1 +// CHECK4-NEXT: store i8* null, i8** [[WORK_FN]], align 8 +// CHECK4-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1 +// CHECK4-NEXT: br label [[DOTAWAIT_WORK:%.*]] +// CHECK4: .await.work: +// CHECK4-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK4-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]]) +// CHECK4-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8 +// CHECK4-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1 +// CHECK4-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 8 +// CHECK4-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null +// CHECK4-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]] +// CHECK4: .select.workers: +// CHECK4-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1 +// 
CHECK4-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0 +// CHECK4-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]] +// CHECK4: .execute.parallel: +// CHECK4-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) +// CHECK4-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)* +// CHECK4-NEXT: call void [[TMP5]](i16 0, i32 [[TMP4]]) +// CHECK4-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]] +// CHECK4: .terminate.parallel: +// CHECK4-NEXT: call void @__kmpc_kernel_end_parallel() +// CHECK4-NEXT: br label [[DOTBARRIER_PARALLEL]] +// CHECK4: .barrier.parallel: +// CHECK4-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK4-NEXT: br label [[DOTAWAIT_WORK]] +// CHECK4: .exit: +// CHECK4-NEXT: ret void +// +// +// CHECK4-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN1S3fooEv_l27 +// CHECK4-SAME: (%struct.S* [[THIS:%.*]], %class.anon.0* nonnull align 8 dereferenceable(8) [[L:%.*]]) #[[ATTR1]] { +// CHECK4-NEXT: entry: +// CHECK4-NEXT: [[THIS_ADDR:%.*]] = alloca %struct.S*, align 8 +// CHECK4-NEXT: [[L_ADDR:%.*]] = alloca %class.anon.0*, align 8 +// CHECK4-NEXT: [[TMP:%.*]] = alloca %class.anon.0*, align 8 +// CHECK4-NEXT: [[L7:%.*]] = alloca [[CLASS_ANON_0:%.*]], align 8 +// CHECK4-NEXT: [[_TMP8:%.*]] = alloca %class.anon.0*, align 8 +// CHECK4-NEXT: store %struct.S* [[THIS]], %struct.S** [[THIS_ADDR]], align 8 +// CHECK4-NEXT: store %class.anon.0* [[L]], %class.anon.0** [[L_ADDR]], align 8 +// CHECK4-NEXT: [[TMP0:%.*]] = load %struct.S*, %struct.S** [[THIS_ADDR]], align 8 +// CHECK4-NEXT: [[TMP1:%.*]] = load %class.anon.0*, %class.anon.0** [[L_ADDR]], align 8 +// CHECK4-NEXT: store %class.anon.0* [[TMP1]], %class.anon.0** [[TMP]], align 8 +// CHECK4-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK4-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK4-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK4-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]] +// CHECK4-NEXT: [[TMP2:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]] +// CHECK4-NEXT: br i1 [[TMP2]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]] +// CHECK4: .worker: +// CHECK4-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN1S3fooEv_l27_worker() #[[ATTR6]] +// CHECK4-NEXT: br label [[DOTEXIT:%.*]] +// CHECK4: .mastercheck: +// CHECK4-NEXT: [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK4-NEXT: [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK4-NEXT: [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK4-NEXT: [[TMP3:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1 +// CHECK4-NEXT: [[TMP4:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1 +// CHECK4-NEXT: [[TMP5:%.*]] = xor i32 [[TMP3]], -1 +// CHECK4-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP4]], [[TMP5]] +// CHECK4-NEXT: [[TMP6:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]] +// CHECK4-NEXT: br i1 [[TMP6]], label [[DOTMASTER:%.*]], label [[DOTEXIT]] +// CHECK4: .master: +// CHECK4-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK4-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK4-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]] +// CHECK4-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1) +// 
CHECK4-NEXT: call void @__kmpc_data_sharing_init_stack() +// CHECK4-NEXT: [[TMP7:%.*]] = load %class.anon.0*, %class.anon.0** [[TMP]], align 8 +// CHECK4-NEXT: [[TMP8:%.*]] = bitcast %class.anon.0* [[L7]] to i8* +// CHECK4-NEXT: [[TMP9:%.*]] = bitcast %class.anon.0* [[TMP7]] to i8* +// CHECK4-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP8]], i8* align 8 [[TMP9]], i64 8, i1 false) +// CHECK4-NEXT: store %class.anon.0* [[L7]], %class.anon.0** [[_TMP8]], align 8 +// CHECK4-NEXT: [[TMP10:%.*]] = load %class.anon.0*, %class.anon.0** [[_TMP8]], align 8 +// CHECK4-NEXT: [[TMP11:%.*]] = getelementptr inbounds [[CLASS_ANON_0]], %class.anon.0* [[TMP10]], i32 0, i32 0 +// CHECK4-NEXT: store %struct.S* [[TMP0]], %struct.S** [[TMP11]], align 8 +// CHECK4-NEXT: [[TMP12:%.*]] = load %class.anon.0*, %class.anon.0** [[_TMP8]], align 8 +// CHECK4-NEXT: [[CALL:%.*]] = call i32 @_ZZN1S3fooEvENKUlvE_clEv(%class.anon.0* nonnull dereferenceable(8) [[TMP12]]) #[[ATTR7]] +// CHECK4-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]] +// CHECK4: .termination.notifier: +// CHECK4-NEXT: call void @__kmpc_kernel_deinit(i16 1) +// CHECK4-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK4-NEXT: br label [[DOTEXIT]] +// CHECK4: .exit: +// CHECK4-NEXT: ret void +// +// +// CHECK4-LABEL: define {{[^@]+}}@_ZZN1S3fooEvENKUlvE_clEv +// CHECK4-SAME: (%class.anon.0* nonnull dereferenceable(8) [[THIS:%.*]]) #[[ATTR4:[0-9]+]] comdat align 2 { +// CHECK4-NEXT: entry: +// CHECK4-NEXT: [[THIS_ADDR:%.*]] = alloca %class.anon.0*, align 8 +// CHECK4-NEXT: store %class.anon.0* [[THIS]], %class.anon.0** [[THIS_ADDR]], align 8 +// CHECK4-NEXT: [[THIS1:%.*]] = load %class.anon.0*, %class.anon.0** [[THIS_ADDR]], align 8 +// CHECK4-NEXT: [[TMP0:%.*]] = getelementptr inbounds [[CLASS_ANON_0:%.*]], %class.anon.0* [[THIS1]], i32 0, i32 0 +// CHECK4-NEXT: [[TMP1:%.*]] = load %struct.S*, %struct.S** [[TMP0]], align 8 +// CHECK4-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT_S:%.*]], %struct.S* [[TMP1]], i32 0, i32 0 +// CHECK4-NEXT: [[TMP2:%.*]] = load i32, i32* [[A]], align 4 +// CHECK4-NEXT: ret i32 [[TMP2]] +// +// +// CHECK4-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN1S3fooEv_l29 +// CHECK4-SAME: (%struct.S* [[THIS:%.*]], %class.anon.0* nonnull align 8 dereferenceable(8) [[L:%.*]]) #[[ATTR1]] { +// CHECK4-NEXT: entry: +// CHECK4-NEXT: [[THIS_ADDR:%.*]] = alloca %struct.S*, align 8 +// CHECK4-NEXT: [[L_ADDR:%.*]] = alloca %class.anon.0*, align 8 +// CHECK4-NEXT: [[TMP:%.*]] = alloca %class.anon.0*, align 8 +// CHECK4-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [2 x i8*], align 8 +// CHECK4-NEXT: store %struct.S* [[THIS]], %struct.S** [[THIS_ADDR]], align 8 +// CHECK4-NEXT: store %class.anon.0* [[L]], %class.anon.0** [[L_ADDR]], align 8 +// CHECK4-NEXT: [[TMP0:%.*]] = load %struct.S*, %struct.S** [[THIS_ADDR]], align 8 +// CHECK4-NEXT: [[TMP1:%.*]] = load %class.anon.0*, %class.anon.0** [[L_ADDR]], align 8 +// CHECK4-NEXT: store %class.anon.0* [[TMP1]], %class.anon.0** [[TMP]], align 8 +// CHECK4-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK4-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 1) +// CHECK4-NEXT: call void @__kmpc_data_sharing_init_stack_spmd() +// CHECK4-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK4: .execute: +// CHECK4-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]]) +// CHECK4-NEXT: [[TMP3:%.*]] = load %class.anon.0*, %class.anon.0** [[TMP]], align 8 +// 
CHECK4-NEXT: [[TMP4:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0 +// CHECK4-NEXT: [[TMP5:%.*]] = bitcast %struct.S* [[TMP0]] to i8* +// CHECK4-NEXT: store i8* [[TMP5]], i8** [[TMP4]], align 8 +// CHECK4-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1 +// CHECK4-NEXT: [[TMP7:%.*]] = bitcast %class.anon.0* [[TMP3]] to i8* +// CHECK4-NEXT: store i8* [[TMP7]], i8** [[TMP6]], align 8 +// CHECK4-NEXT: [[TMP8:%.*]] = bitcast [2 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK4-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, %struct.S*, %class.anon.0*)* @__omp_outlined__1 to i8*), i8* null, i8** [[TMP8]], i64 2) +// CHECK4-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK4: .omp.deinit: +// CHECK4-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 1) +// CHECK4-NEXT: br label [[DOTEXIT:%.*]] +// CHECK4: .exit: +// CHECK4-NEXT: ret void +// +// +// CHECK4-LABEL: define {{[^@]+}}@__omp_outlined__1 +// CHECK4-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], %struct.S* [[THIS:%.*]], %class.anon.0* nonnull align 8 dereferenceable(8) [[L:%.*]]) #[[ATTR1]] { +// CHECK4-NEXT: entry: +// CHECK4-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK4-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK4-NEXT: [[THIS_ADDR:%.*]] = alloca %struct.S*, align 8 +// CHECK4-NEXT: [[L_ADDR:%.*]] = alloca %class.anon.0*, align 8 +// CHECK4-NEXT: [[TMP:%.*]] = alloca %class.anon.0*, align 8 +// CHECK4-NEXT: [[L1:%.*]] = alloca [[CLASS_ANON_0:%.*]], align 8 +// CHECK4-NEXT: [[_TMP2:%.*]] = alloca %class.anon.0*, align 8 +// CHECK4-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK4-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +// CHECK4-NEXT: store %struct.S* [[THIS]], %struct.S** [[THIS_ADDR]], align 8 +// CHECK4-NEXT: store %class.anon.0* [[L]], %class.anon.0** [[L_ADDR]], align 8 +// CHECK4-NEXT: [[TMP0:%.*]] = load %struct.S*, %struct.S** [[THIS_ADDR]], align 8 +// CHECK4-NEXT: [[TMP1:%.*]] = load %class.anon.0*, %class.anon.0** [[L_ADDR]], align 8 +// CHECK4-NEXT: store %class.anon.0* [[TMP1]], %class.anon.0** [[TMP]], align 8 +// CHECK4-NEXT: [[TMP2:%.*]] = load %class.anon.0*, %class.anon.0** [[TMP]], align 8 +// CHECK4-NEXT: [[TMP3:%.*]] = bitcast %class.anon.0* [[L1]] to i8* +// CHECK4-NEXT: [[TMP4:%.*]] = bitcast %class.anon.0* [[TMP2]] to i8* +// CHECK4-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP3]], i8* align 8 [[TMP4]], i64 8, i1 false) +// CHECK4-NEXT: store %class.anon.0* [[L1]], %class.anon.0** [[_TMP2]], align 8 +// CHECK4-NEXT: [[TMP5:%.*]] = load %class.anon.0*, %class.anon.0** [[_TMP2]], align 8 +// CHECK4-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[CLASS_ANON_0]], %class.anon.0* [[TMP5]], i32 0, i32 0 +// CHECK4-NEXT: store %struct.S* [[TMP0]], %struct.S** [[TMP6]], align 8 +// CHECK4-NEXT: [[TMP7:%.*]] = load %class.anon.0*, %class.anon.0** [[_TMP2]], align 8 +// CHECK4-NEXT: [[CALL:%.*]] = call i32 @_ZZN1S3fooEvENKUlvE_clEv(%class.anon.0* nonnull dereferenceable(8) [[TMP7]]) #[[ATTR7]] +// CHECK4-NEXT: ret void +// +// +// CHECK4-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooIZN1S3fooEvEUlvE_EiRKT__l18 +// CHECK4-SAME: (%class.anon.0* nonnull align 8 dereferenceable(8) [[T:%.*]]) #[[ATTR1]] { +// CHECK4-NEXT: entry: +// CHECK4-NEXT: [[T_ADDR:%.*]] = 
alloca %class.anon.0*, align 8 +// CHECK4-NEXT: [[TMP:%.*]] = alloca %class.anon.0*, align 8 +// CHECK4-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [1 x i8*], align 8 +// CHECK4-NEXT: store %class.anon.0* [[T]], %class.anon.0** [[T_ADDR]], align 8 +// CHECK4-NEXT: [[TMP0:%.*]] = load %class.anon.0*, %class.anon.0** [[T_ADDR]], align 8 +// CHECK4-NEXT: store %class.anon.0* [[TMP0]], %class.anon.0** [[TMP]], align 8 +// CHECK4-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK4-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 1) +// CHECK4-NEXT: call void @__kmpc_data_sharing_init_stack_spmd() +// CHECK4-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK4: .execute: +// CHECK4-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]]) +// CHECK4-NEXT: [[TMP2:%.*]] = load %class.anon.0*, %class.anon.0** [[TMP]], align 8 +// CHECK4-NEXT: [[TMP3:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0 +// CHECK4-NEXT: [[TMP4:%.*]] = bitcast %class.anon.0* [[TMP2]] to i8* +// CHECK4-NEXT: store i8* [[TMP4]], i8** [[TMP3]], align 8 +// CHECK4-NEXT: [[TMP5:%.*]] = bitcast [1 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK4-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB2]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, %class.anon.0*)* @__omp_outlined__2 to i8*), i8* null, i8** [[TMP5]], i64 1) +// CHECK4-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK4: .omp.deinit: +// CHECK4-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 1) +// CHECK4-NEXT: br label [[DOTEXIT:%.*]] +// CHECK4: .exit: +// CHECK4-NEXT: ret void +// +// +// CHECK4-LABEL: define {{[^@]+}}@__omp_outlined__2 +// CHECK4-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], %class.anon.0* nonnull align 8 dereferenceable(8) [[T:%.*]]) #[[ATTR1]] { +// CHECK4-NEXT: entry: +// CHECK4-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK4-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK4-NEXT: [[T_ADDR:%.*]] = alloca %class.anon.0*, align 8 +// CHECK4-NEXT: [[TMP:%.*]] = alloca %class.anon.0*, align 8 +// CHECK4-NEXT: [[T1:%.*]] = alloca [[CLASS_ANON_0:%.*]], align 8 +// CHECK4-NEXT: [[_TMP2:%.*]] = alloca %class.anon.0*, align 8 +// CHECK4-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK4-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +// CHECK4-NEXT: store %class.anon.0* [[T]], %class.anon.0** [[T_ADDR]], align 8 +// CHECK4-NEXT: [[TMP0:%.*]] = load %class.anon.0*, %class.anon.0** [[T_ADDR]], align 8 +// CHECK4-NEXT: store %class.anon.0* [[TMP0]], %class.anon.0** [[TMP]], align 8 +// CHECK4-NEXT: [[TMP1:%.*]] = load %class.anon.0*, %class.anon.0** [[TMP]], align 8 +// CHECK4-NEXT: [[TMP2:%.*]] = bitcast %class.anon.0* [[T1]] to i8* +// CHECK4-NEXT: [[TMP3:%.*]] = bitcast %class.anon.0* [[TMP1]] to i8* +// CHECK4-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP2]], i8* align 8 [[TMP3]], i64 8, i1 false) +// CHECK4-NEXT: store %class.anon.0* [[T1]], %class.anon.0** [[_TMP2]], align 8 +// CHECK4-NEXT: [[TMP4:%.*]] = load %class.anon.0*, %class.anon.0** [[_TMP2]], align 8 +// CHECK4-NEXT: [[TMP5:%.*]] = load %class.anon.0*, %class.anon.0** [[_TMP2]], align 8 +// CHECK4-NEXT: [[CALL:%.*]] = call i32 @_ZZN1S3fooEvENKUlvE_clEv(%class.anon.0* nonnull dereferenceable(8) [[TMP5]]) #[[ATTR7]] +// CHECK4-NEXT: ret void +// diff --git 
a/clang/test/OpenMP/nvptx_multi_target_parallel_codegen.cpp b/clang/test/OpenMP/nvptx_multi_target_parallel_codegen.cpp new file mode 100644 --- /dev/null +++ b/clang/test/OpenMP/nvptx_multi_target_parallel_codegen.cpp @@ -0,0 +1,401 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature --include-generated-funcs --replace-function-regex "__omp_offloading_[0-9a-z]+_[0-9a-z]+" +// Test target codegen - host bc file has to be created first. +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix CHECK1 +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -x c++ -triple i386-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm-bc %s -o %t-x86-host.bc +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix CHECK2 +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix CHECK3 + +// expected-no-diagnostics +#ifndef HEADER +#define HEADER + +void work(); + +void use() { + #pragma omp parallel + work(); +} + +int main() { + #pragma omp target parallel + { use(); } + #pragma omp target + { use(); } +} + +#endif +// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l21 +// CHECK1-SAME: () #[[ATTR0:[0-9]+]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 8 +// CHECK1-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK1-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 1) +// CHECK1-NEXT: call void @__kmpc_data_sharing_init_stack_spmd() +// CHECK1-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK1: .execute: +// CHECK1-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]]) +// CHECK1-NEXT: [[TMP1:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK1-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__ to i8*), i8* null, i8** [[TMP1]], i64 0) +// CHECK1-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK1: .omp.deinit: +// CHECK1-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 1) +// CHECK1-NEXT: br label [[DOTEXIT:%.*]] +// CHECK1: .exit: +// CHECK1-NEXT: ret void +// +// +// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__ +// CHECK1-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK1-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK1-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +// 
CHECK1-NEXT: call void @_Z3usev() #[[ATTR7:[0-9]+]] +// CHECK1-NEXT: ret void +// +// +// CHECK1-LABEL: define {{[^@]+}}@_Z3usev +// CHECK1-SAME: () #[[ATTR2:[0-9]+]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 8 +// CHECK1-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) +// CHECK1-NEXT: [[TMP1:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK1-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__1 to i8*), i8* null, i8** [[TMP1]], i64 0) +// CHECK1-NEXT: ret void +// +// +// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__1 +// CHECK1-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK1-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK1-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +// CHECK1-NEXT: call void @_Z4workv() #[[ATTR7]] +// CHECK1-NEXT: ret void +// +// +// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l23_worker +// CHECK1-SAME: () #[[ATTR5:[0-9]+]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 8 +// CHECK1-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1 +// CHECK1-NEXT: store i8* null, i8** [[WORK_FN]], align 8 +// CHECK1-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1 +// CHECK1-NEXT: br label [[DOTAWAIT_WORK:%.*]] +// CHECK1: .await.work: +// CHECK1-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK1-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]]) +// CHECK1-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8 +// CHECK1-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1 +// CHECK1-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 8 +// CHECK1-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null +// CHECK1-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]] +// CHECK1: .select.workers: +// CHECK1-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1 +// CHECK1-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0 +// CHECK1-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]] +// CHECK1: .execute.parallel: +// CHECK1-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2:[0-9]+]]) +// CHECK1-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)* +// CHECK1-NEXT: call void [[TMP5]](i16 0, i32 [[TMP4]]) +// CHECK1-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]] +// CHECK1: .terminate.parallel: +// CHECK1-NEXT: call void @__kmpc_kernel_end_parallel() +// CHECK1-NEXT: br label [[DOTBARRIER_PARALLEL]] +// CHECK1: .barrier.parallel: +// CHECK1-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK1-NEXT: br label [[DOTAWAIT_WORK]] +// CHECK1: .exit: +// CHECK1-NEXT: ret void +// +// +// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l23 +// CHECK1-SAME: () #[[ATTR0]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK1-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK1-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 
@llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK1-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]] +// CHECK1-NEXT: [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]] +// CHECK1-NEXT: br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]] +// CHECK1: .worker: +// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l23_worker() #[[ATTR4:[0-9]+]] +// CHECK1-NEXT: br label [[DOTEXIT:%.*]] +// CHECK1: .mastercheck: +// CHECK1-NEXT: [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK1-NEXT: [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK1-NEXT: [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK1-NEXT: [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1 +// CHECK1-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1 +// CHECK1-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], -1 +// CHECK1-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP2]], [[TMP3]] +// CHECK1-NEXT: [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]] +// CHECK1-NEXT: br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]] +// CHECK1: .master: +// CHECK1-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK1-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK1-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]] +// CHECK1-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1) +// CHECK1-NEXT: call void @__kmpc_data_sharing_init_stack() +// CHECK1-NEXT: call void @_Z3usev() #[[ATTR7]] +// CHECK1-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]] +// CHECK1: .termination.notifier: +// CHECK1-NEXT: call void @__kmpc_kernel_deinit(i16 1) +// CHECK1-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK1-NEXT: br label [[DOTEXIT]] +// CHECK1: .exit: +// CHECK1-NEXT: ret void +// +// +// CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l21 +// CHECK2-SAME: () #[[ATTR0:[0-9]+]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 4 +// CHECK2-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK2-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 1) +// CHECK2-NEXT: call void @__kmpc_data_sharing_init_stack_spmd() +// CHECK2-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK2: .execute: +// CHECK2-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]]) +// CHECK2-NEXT: [[TMP1:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK2-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__ to i8*), i8* null, i8** [[TMP1]], i32 0) +// CHECK2-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK2: .omp.deinit: +// CHECK2-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 1) +// CHECK2-NEXT: br label [[DOTEXIT:%.*]] +// CHECK2: .exit: +// CHECK2-NEXT: ret void +// +// +// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__ +// CHECK2-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK2-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK2-NEXT: store i32* 
[[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK2-NEXT: call void @_Z3usev() #[[ATTR7:[0-9]+]] +// CHECK2-NEXT: ret void +// +// +// CHECK2-LABEL: define {{[^@]+}}@_Z3usev +// CHECK2-SAME: () #[[ATTR2:[0-9]+]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 4 +// CHECK2-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) +// CHECK2-NEXT: [[TMP1:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK2-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__1 to i8*), i8* null, i8** [[TMP1]], i32 0) +// CHECK2-NEXT: ret void +// +// +// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__1 +// CHECK2-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK2-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK2-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK2-NEXT: call void @_Z4workv() #[[ATTR7]] +// CHECK2-NEXT: ret void +// +// +// CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l23_worker +// CHECK2-SAME: () #[[ATTR5:[0-9]+]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 4 +// CHECK2-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1 +// CHECK2-NEXT: store i8* null, i8** [[WORK_FN]], align 4 +// CHECK2-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1 +// CHECK2-NEXT: br label [[DOTAWAIT_WORK:%.*]] +// CHECK2: .await.work: +// CHECK2-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK2-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]]) +// CHECK2-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8 +// CHECK2-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1 +// CHECK2-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 4 +// CHECK2-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null +// CHECK2-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]] +// CHECK2: .select.workers: +// CHECK2-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1 +// CHECK2-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0 +// CHECK2-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]] +// CHECK2: .execute.parallel: +// CHECK2-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2:[0-9]+]]) +// CHECK2-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)* +// CHECK2-NEXT: call void [[TMP5]](i16 0, i32 [[TMP4]]) +// CHECK2-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]] +// CHECK2: .terminate.parallel: +// CHECK2-NEXT: call void @__kmpc_kernel_end_parallel() +// CHECK2-NEXT: br label [[DOTBARRIER_PARALLEL]] +// CHECK2: .barrier.parallel: +// CHECK2-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK2-NEXT: br label [[DOTAWAIT_WORK]] +// CHECK2: .exit: +// CHECK2-NEXT: ret void +// +// +// CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l23 +// CHECK2-SAME: () #[[ATTR0]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK2-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// 
CHECK2-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK2-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]] +// CHECK2-NEXT: [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]] +// CHECK2-NEXT: br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]] +// CHECK2: .worker: +// CHECK2-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l23_worker() #[[ATTR4:[0-9]+]] +// CHECK2-NEXT: br label [[DOTEXIT:%.*]] +// CHECK2: .mastercheck: +// CHECK2-NEXT: [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK2-NEXT: [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK2-NEXT: [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK2-NEXT: [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1 +// CHECK2-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1 +// CHECK2-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], -1 +// CHECK2-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP2]], [[TMP3]] +// CHECK2-NEXT: [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]] +// CHECK2-NEXT: br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]] +// CHECK2: .master: +// CHECK2-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK2-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK2-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]] +// CHECK2-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1) +// CHECK2-NEXT: call void @__kmpc_data_sharing_init_stack() +// CHECK2-NEXT: call void @_Z3usev() #[[ATTR7]] +// CHECK2-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]] +// CHECK2: .termination.notifier: +// CHECK2-NEXT: call void @__kmpc_kernel_deinit(i16 1) +// CHECK2-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK2-NEXT: br label [[DOTEXIT]] +// CHECK2: .exit: +// CHECK2-NEXT: ret void +// +// +// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l21 +// CHECK3-SAME: () #[[ATTR0:[0-9]+]] { +// CHECK3-NEXT: entry: +// CHECK3-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 4 +// CHECK3-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK3-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 1) +// CHECK3-NEXT: call void @__kmpc_data_sharing_init_stack_spmd() +// CHECK3-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK3: .execute: +// CHECK3-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]]) +// CHECK3-NEXT: [[TMP1:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK3-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__ to i8*), i8* null, i8** [[TMP1]], i32 0) +// CHECK3-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK3: .omp.deinit: +// CHECK3-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 1) +// CHECK3-NEXT: br label [[DOTEXIT:%.*]] +// CHECK3: .exit: +// CHECK3-NEXT: ret void +// +// +// CHECK3-LABEL: define {{[^@]+}}@__omp_outlined__ +// CHECK3-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { +// CHECK3-NEXT: entry: +// CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK3-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** 
[[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK3-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK3-NEXT: call void @_Z3usev() #[[ATTR7:[0-9]+]] +// CHECK3-NEXT: ret void +// +// +// CHECK3-LABEL: define {{[^@]+}}@_Z3usev +// CHECK3-SAME: () #[[ATTR2:[0-9]+]] { +// CHECK3-NEXT: entry: +// CHECK3-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 4 +// CHECK3-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) +// CHECK3-NEXT: [[TMP1:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK3-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__1 to i8*), i8* null, i8** [[TMP1]], i32 0) +// CHECK3-NEXT: ret void +// +// +// CHECK3-LABEL: define {{[^@]+}}@__omp_outlined__1 +// CHECK3-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { +// CHECK3-NEXT: entry: +// CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK3-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK3-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK3-NEXT: call void @_Z4workv() #[[ATTR7]] +// CHECK3-NEXT: ret void +// +// +// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l23_worker +// CHECK3-SAME: () #[[ATTR5:[0-9]+]] { +// CHECK3-NEXT: entry: +// CHECK3-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 4 +// CHECK3-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1 +// CHECK3-NEXT: store i8* null, i8** [[WORK_FN]], align 4 +// CHECK3-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1 +// CHECK3-NEXT: br label [[DOTAWAIT_WORK:%.*]] +// CHECK3: .await.work: +// CHECK3-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK3-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]]) +// CHECK3-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8 +// CHECK3-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1 +// CHECK3-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 4 +// CHECK3-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null +// CHECK3-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]] +// CHECK3: .select.workers: +// CHECK3-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1 +// CHECK3-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0 +// CHECK3-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]] +// CHECK3: .execute.parallel: +// CHECK3-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2:[0-9]+]]) +// CHECK3-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)* +// CHECK3-NEXT: call void [[TMP5]](i16 0, i32 [[TMP4]]) +// CHECK3-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]] +// CHECK3: .terminate.parallel: +// CHECK3-NEXT: call void @__kmpc_kernel_end_parallel() +// CHECK3-NEXT: br label [[DOTBARRIER_PARALLEL]] +// CHECK3: .barrier.parallel: +// CHECK3-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK3-NEXT: br label [[DOTAWAIT_WORK]] +// CHECK3: .exit: +// CHECK3-NEXT: ret void +// +// +// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l23 +// CHECK3-SAME: () #[[ATTR0]] { +// CHECK3-NEXT: entry: +// CHECK3-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK3-NEXT: 
[[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK3-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK3-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]] +// CHECK3-NEXT: [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]] +// CHECK3-NEXT: br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]] +// CHECK3: .worker: +// CHECK3-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l23_worker() #[[ATTR4:[0-9]+]] +// CHECK3-NEXT: br label [[DOTEXIT:%.*]] +// CHECK3: .mastercheck: +// CHECK3-NEXT: [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK3-NEXT: [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK3-NEXT: [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK3-NEXT: [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1 +// CHECK3-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1 +// CHECK3-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], -1 +// CHECK3-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP2]], [[TMP3]] +// CHECK3-NEXT: [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]] +// CHECK3-NEXT: br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]] +// CHECK3: .master: +// CHECK3-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK3-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK3-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]] +// CHECK3-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1) +// CHECK3-NEXT: call void @__kmpc_data_sharing_init_stack() +// CHECK3-NEXT: call void @_Z3usev() #[[ATTR7]] +// CHECK3-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]] +// CHECK3: .termination.notifier: +// CHECK3-NEXT: call void @__kmpc_kernel_deinit(i16 1) +// CHECK3-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK3-NEXT: br label [[DOTEXIT]] +// CHECK3: .exit: +// CHECK3-NEXT: ret void +// diff --git a/clang/test/OpenMP/nvptx_nested_parallel_codegen.cpp b/clang/test/OpenMP/nvptx_nested_parallel_codegen.cpp new file mode 100644 --- /dev/null +++ b/clang/test/OpenMP/nvptx_nested_parallel_codegen.cpp @@ -0,0 +1,652 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature --include-generated-funcs --replace-function-regex "__omp_offloading_[0-9a-z]+_[0-9a-z]+" +// Test target codegen - host bc file has to be created first. 
+// RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix CHECK1 +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -x c++ -triple i386-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm-bc %s -o %t-x86-host.bc +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix CHECK2 +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix CHECK3 + +// expected-no-diagnostics +#ifndef HEADER +#define HEADER + +void work(int *C) { + #pragma omp atomic + ++(*C); +} + +void use(int *C) { + #pragma omp parallel num_threads(2) + work(C); +} + +int main() { + int C = 0; + #pragma omp target map(C) + { + use(&C); + #pragma omp parallel num_threads(2) + use(&C); + } + + return C; +} + +#endif +// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l25_worker +// CHECK1-SAME: () #[[ATTR0:[0-9]+]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 8 +// CHECK1-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1 +// CHECK1-NEXT: store i8* null, i8** [[WORK_FN]], align 8 +// CHECK1-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1 +// CHECK1-NEXT: br label [[DOTAWAIT_WORK:%.*]] +// CHECK1: .await.work: +// CHECK1-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK1-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]]) +// CHECK1-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8 +// CHECK1-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1 +// CHECK1-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 8 +// CHECK1-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null +// CHECK1-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]] +// CHECK1: .select.workers: +// CHECK1-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1 +// CHECK1-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0 +// CHECK1-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]] +// CHECK1: .execute.parallel: +// CHECK1-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]]) +// CHECK1-NEXT: [[TMP5:%.*]] = load i8*, i8** [[WORK_FN]], align 8 +// CHECK1-NEXT: [[WORK_MATCH:%.*]] = icmp eq i8* [[TMP5]], bitcast (void (i16, i32)* @__omp_outlined___wrapper to i8*) +// CHECK1-NEXT: br i1 [[WORK_MATCH]], label [[DOTEXECUTE_FN:%.*]], label [[DOTCHECK_NEXT:%.*]] +// CHECK1: .execute.fn: +// CHECK1-NEXT: call void @__omp_outlined___wrapper(i16 0, i32 [[TMP4]]) #[[ATTR5:[0-9]+]] +// CHECK1-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]] +// CHECK1: .check.next: +// CHECK1-NEXT: [[TMP6:%.*]] = load i8*, i8** [[WORK_FN]], align 8 +// CHECK1-NEXT: [[WORK_MATCH1:%.*]] = icmp eq i8* [[TMP6]], bitcast (void (i16, i32)* 
@__omp_outlined__1_wrapper to i8*) +// CHECK1-NEXT: br i1 [[WORK_MATCH1]], label [[DOTEXECUTE_FN2:%.*]], label [[DOTCHECK_NEXT3:%.*]] +// CHECK1: .execute.fn2: +// CHECK1-NEXT: call void @__omp_outlined__1_wrapper(i16 0, i32 [[TMP4]]) #[[ATTR5]] +// CHECK1-NEXT: br label [[DOTTERMINATE_PARALLEL]] +// CHECK1: .check.next3: +// CHECK1-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)* +// CHECK1-NEXT: call void [[TMP7]](i16 0, i32 [[TMP4]]) +// CHECK1-NEXT: br label [[DOTTERMINATE_PARALLEL]] +// CHECK1: .terminate.parallel: +// CHECK1-NEXT: call void @__kmpc_kernel_end_parallel() +// CHECK1-NEXT: br label [[DOTBARRIER_PARALLEL]] +// CHECK1: .barrier.parallel: +// CHECK1-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK1-NEXT: br label [[DOTAWAIT_WORK]] +// CHECK1: .exit: +// CHECK1-NEXT: ret void +// +// +// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l25 +// CHECK1-SAME: (i32* nonnull align 4 dereferenceable(4) [[C:%.*]]) #[[ATTR1:[0-9]+]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[C_ADDR:%.*]] = alloca i32*, align 8 +// CHECK1-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [1 x i8*], align 8 +// CHECK1-NEXT: store i32* [[C]], i32** [[C_ADDR]], align 8 +// CHECK1-NEXT: [[TMP0:%.*]] = load i32*, i32** [[C_ADDR]], align 8 +// CHECK1-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK1-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK1-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK1-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]] +// CHECK1-NEXT: [[TMP1:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]] +// CHECK1-NEXT: br i1 [[TMP1]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]] +// CHECK1: .worker: +// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l25_worker() #[[ATTR5]] +// CHECK1-NEXT: br label [[DOTEXIT:%.*]] +// CHECK1: .mastercheck: +// CHECK1-NEXT: [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK1-NEXT: [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK1-NEXT: [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK1-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1 +// CHECK1-NEXT: [[TMP3:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1 +// CHECK1-NEXT: [[TMP4:%.*]] = xor i32 [[TMP2]], -1 +// CHECK1-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP3]], [[TMP4]] +// CHECK1-NEXT: [[TMP5:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]] +// CHECK1-NEXT: br i1 [[TMP5]], label [[DOTMASTER:%.*]], label [[DOTEXIT]] +// CHECK1: .master: +// CHECK1-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK1-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK1-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]] +// CHECK1-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1) +// CHECK1-NEXT: call void @__kmpc_data_sharing_init_stack() +// CHECK1-NEXT: [[TMP6:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) +// CHECK1-NEXT: call void @_Z3usePi(i32* [[TMP0]]) #[[ATTR7:[0-9]+]] +// CHECK1-NEXT: call void @__kmpc_push_num_threads(%struct.ident_t* @[[GLOB1]], i32 [[TMP6]], i32 2) +// CHECK1-NEXT: [[TMP7:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0 +// CHECK1-NEXT: [[TMP8:%.*]] = bitcast i32* [[TMP0]] 
to i8* +// CHECK1-NEXT: store i8* [[TMP8]], i8** [[TMP7]], align 8 +// CHECK1-NEXT: [[TMP9:%.*]] = bitcast [1 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK1-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP6]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32*)* @__omp_outlined__1 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__1_wrapper to i8*), i8** [[TMP9]], i64 1) +// CHECK1-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]] +// CHECK1: .termination.notifier: +// CHECK1-NEXT: call void @__kmpc_kernel_deinit(i16 1) +// CHECK1-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK1-NEXT: br label [[DOTEXIT]] +// CHECK1: .exit: +// CHECK1-NEXT: ret void +// +// +// CHECK1-LABEL: define {{[^@]+}}@_Z3usePi +// CHECK1-SAME: (i32* [[C:%.*]]) #[[ATTR3:[0-9]+]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[C_ADDR:%.*]] = alloca i32*, align 8 +// CHECK1-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [1 x i8*], align 8 +// CHECK1-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) +// CHECK1-NEXT: store i32* [[C]], i32** [[C_ADDR]], align 8 +// CHECK1-NEXT: call void @__kmpc_push_num_threads(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]], i32 2) +// CHECK1-NEXT: [[TMP1:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0 +// CHECK1-NEXT: [[TMP2:%.*]] = bitcast i32** [[C_ADDR]] to i8* +// CHECK1-NEXT: store i8* [[TMP2]], i8** [[TMP1]], align 8 +// CHECK1-NEXT: [[TMP3:%.*]] = bitcast [1 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK1-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32**)* @__omp_outlined__ to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined___wrapper to i8*), i8** [[TMP3]], i64 1) +// CHECK1-NEXT: ret void +// +// +// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__ +// CHECK1-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32** nonnull align 8 dereferenceable(8) [[C:%.*]]) #[[ATTR1]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK1-NEXT: [[C_ADDR:%.*]] = alloca i32**, align 8 +// CHECK1-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK1-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +// CHECK1-NEXT: store i32** [[C]], i32*** [[C_ADDR]], align 8 +// CHECK1-NEXT: [[TMP0:%.*]] = load i32**, i32*** [[C_ADDR]], align 8 +// CHECK1-NEXT: [[TMP1:%.*]] = load i32*, i32** [[TMP0]], align 8 +// CHECK1-NEXT: call void @_Z4workPi(i32* [[TMP1]]) #[[ATTR7]] +// CHECK1-NEXT: ret void +// +// +// CHECK1-LABEL: define {{[^@]+}}@_Z4workPi +// CHECK1-SAME: (i32* [[C:%.*]]) #[[ATTR3]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[C_ADDR:%.*]] = alloca i32*, align 8 +// CHECK1-NEXT: [[ATOMIC_TEMP:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[ATOMIC_TEMP1:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: store i32* [[C]], i32** [[C_ADDR]], align 8 +// CHECK1-NEXT: [[TMP0:%.*]] = load i32*, i32** [[C_ADDR]], align 8 +// CHECK1-NEXT: [[TMP1:%.*]] = bitcast i32* [[TMP0]] to i8* +// CHECK1-NEXT: [[TMP2:%.*]] = bitcast i32* [[ATOMIC_TEMP]] to i8* +// CHECK1-NEXT: call void @__atomic_load(i64 4, i8* [[TMP1]], i8* [[TMP2]], i32 0) #[[ATTR7]] +// CHECK1-NEXT: br label [[ATOMIC_CONT:%.*]] +// CHECK1: atomic_cont: +// CHECK1-NEXT: [[TMP3:%.*]] = load i32, i32* [[ATOMIC_TEMP]], align 4 
+// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP3]], 1 +// CHECK1-NEXT: store i32 [[ADD]], i32* [[ATOMIC_TEMP1]], align 4 +// CHECK1-NEXT: [[TMP4:%.*]] = bitcast i32* [[TMP0]] to i8* +// CHECK1-NEXT: [[TMP5:%.*]] = bitcast i32* [[ATOMIC_TEMP]] to i8* +// CHECK1-NEXT: [[TMP6:%.*]] = bitcast i32* [[ATOMIC_TEMP1]] to i8* +// CHECK1-NEXT: [[CALL:%.*]] = call zeroext i1 @__atomic_compare_exchange(i64 4, i8* [[TMP4]], i8* [[TMP5]], i8* [[TMP6]], i32 0, i32 0) #[[ATTR7]] +// CHECK1-NEXT: br i1 [[CALL]], label [[ATOMIC_EXIT:%.*]], label [[ATOMIC_CONT]] +// CHECK1: atomic_exit: +// CHECK1-NEXT: ret void +// +// +// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined___wrapper +// CHECK1-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 +// CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 +// CHECK1-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK1-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 +// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK1-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) +// CHECK1-NEXT: [[TMP2:%.*]] = load i8**, i8*** [[GLOBAL_ARGS]], align 8 +// CHECK1-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8*, i8** [[TMP2]], i64 0 +// CHECK1-NEXT: [[TMP4:%.*]] = bitcast i8** [[TMP3]] to i32*** +// CHECK1-NEXT: [[TMP5:%.*]] = load i32**, i32*** [[TMP4]], align 8 +// CHECK1-NEXT: call void @__omp_outlined__(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]], i32** [[TMP5]]) #[[ATTR5]] +// CHECK1-NEXT: ret void +// +// +// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__1 +// CHECK1-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[C:%.*]]) #[[ATTR1]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK1-NEXT: [[C_ADDR:%.*]] = alloca i32*, align 8 +// CHECK1-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK1-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +// CHECK1-NEXT: store i32* [[C]], i32** [[C_ADDR]], align 8 +// CHECK1-NEXT: [[TMP0:%.*]] = load i32*, i32** [[C_ADDR]], align 8 +// CHECK1-NEXT: call void @_Z3usePi(i32* [[TMP0]]) #[[ATTR7]] +// CHECK1-NEXT: ret void +// +// +// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__1_wrapper +// CHECK1-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 +// CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 +// CHECK1-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK1-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 +// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK1-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) +// CHECK1-NEXT: [[TMP2:%.*]] = load i8**, i8*** [[GLOBAL_ARGS]], align 8 +// CHECK1-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8*, i8** [[TMP2]], i64 0 +// CHECK1-NEXT: [[TMP4:%.*]] = bitcast i8** [[TMP3]] to i32** +// CHECK1-NEXT: [[TMP5:%.*]] = load i32*, i32** [[TMP4]], align 8 +// CHECK1-NEXT: call void @__omp_outlined__1(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]], 
i32* [[TMP5]]) #[[ATTR5]] +// CHECK1-NEXT: ret void +// +// +// CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l25_worker +// CHECK2-SAME: () #[[ATTR0:[0-9]+]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 4 +// CHECK2-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1 +// CHECK2-NEXT: store i8* null, i8** [[WORK_FN]], align 4 +// CHECK2-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1 +// CHECK2-NEXT: br label [[DOTAWAIT_WORK:%.*]] +// CHECK2: .await.work: +// CHECK2-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK2-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]]) +// CHECK2-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8 +// CHECK2-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1 +// CHECK2-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 4 +// CHECK2-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null +// CHECK2-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]] +// CHECK2: .select.workers: +// CHECK2-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1 +// CHECK2-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0 +// CHECK2-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]] +// CHECK2: .execute.parallel: +// CHECK2-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]]) +// CHECK2-NEXT: [[TMP5:%.*]] = load i8*, i8** [[WORK_FN]], align 4 +// CHECK2-NEXT: [[WORK_MATCH:%.*]] = icmp eq i8* [[TMP5]], bitcast (void (i16, i32)* @__omp_outlined___wrapper to i8*) +// CHECK2-NEXT: br i1 [[WORK_MATCH]], label [[DOTEXECUTE_FN:%.*]], label [[DOTCHECK_NEXT:%.*]] +// CHECK2: .execute.fn: +// CHECK2-NEXT: call void @__omp_outlined___wrapper(i16 0, i32 [[TMP4]]) #[[ATTR5:[0-9]+]] +// CHECK2-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]] +// CHECK2: .check.next: +// CHECK2-NEXT: [[TMP6:%.*]] = load i8*, i8** [[WORK_FN]], align 4 +// CHECK2-NEXT: [[WORK_MATCH1:%.*]] = icmp eq i8* [[TMP6]], bitcast (void (i16, i32)* @__omp_outlined__1_wrapper to i8*) +// CHECK2-NEXT: br i1 [[WORK_MATCH1]], label [[DOTEXECUTE_FN2:%.*]], label [[DOTCHECK_NEXT3:%.*]] +// CHECK2: .execute.fn2: +// CHECK2-NEXT: call void @__omp_outlined__1_wrapper(i16 0, i32 [[TMP4]]) #[[ATTR5]] +// CHECK2-NEXT: br label [[DOTTERMINATE_PARALLEL]] +// CHECK2: .check.next3: +// CHECK2-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)* +// CHECK2-NEXT: call void [[TMP7]](i16 0, i32 [[TMP4]]) +// CHECK2-NEXT: br label [[DOTTERMINATE_PARALLEL]] +// CHECK2: .terminate.parallel: +// CHECK2-NEXT: call void @__kmpc_kernel_end_parallel() +// CHECK2-NEXT: br label [[DOTBARRIER_PARALLEL]] +// CHECK2: .barrier.parallel: +// CHECK2-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK2-NEXT: br label [[DOTAWAIT_WORK]] +// CHECK2: .exit: +// CHECK2-NEXT: ret void +// +// +// CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l25 +// CHECK2-SAME: (i32* nonnull align 4 dereferenceable(4) [[C:%.*]]) #[[ATTR1:[0-9]+]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[C_ADDR:%.*]] = alloca i32*, align 4 +// CHECK2-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [1 x i8*], align 4 +// CHECK2-NEXT: store i32* [[C]], i32** [[C_ADDR]], align 4 +// CHECK2-NEXT: [[TMP0:%.*]] = load i32*, i32** [[C_ADDR]], align 4 +// CHECK2-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK2-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 
@llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK2-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK2-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]] +// CHECK2-NEXT: [[TMP1:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]] +// CHECK2-NEXT: br i1 [[TMP1]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]] +// CHECK2: .worker: +// CHECK2-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l25_worker() #[[ATTR5]] +// CHECK2-NEXT: br label [[DOTEXIT:%.*]] +// CHECK2: .mastercheck: +// CHECK2-NEXT: [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK2-NEXT: [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK2-NEXT: [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK2-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1 +// CHECK2-NEXT: [[TMP3:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1 +// CHECK2-NEXT: [[TMP4:%.*]] = xor i32 [[TMP2]], -1 +// CHECK2-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP3]], [[TMP4]] +// CHECK2-NEXT: [[TMP5:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]] +// CHECK2-NEXT: br i1 [[TMP5]], label [[DOTMASTER:%.*]], label [[DOTEXIT]] +// CHECK2: .master: +// CHECK2-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK2-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK2-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]] +// CHECK2-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1) +// CHECK2-NEXT: call void @__kmpc_data_sharing_init_stack() +// CHECK2-NEXT: [[TMP6:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) +// CHECK2-NEXT: call void @_Z3usePi(i32* [[TMP0]]) #[[ATTR7:[0-9]+]] +// CHECK2-NEXT: call void @__kmpc_push_num_threads(%struct.ident_t* @[[GLOB1]], i32 [[TMP6]], i32 2) +// CHECK2-NEXT: [[TMP7:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 +// CHECK2-NEXT: [[TMP8:%.*]] = bitcast i32* [[TMP0]] to i8* +// CHECK2-NEXT: store i8* [[TMP8]], i8** [[TMP7]], align 4 +// CHECK2-NEXT: [[TMP9:%.*]] = bitcast [1 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK2-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP6]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32*)* @__omp_outlined__1 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__1_wrapper to i8*), i8** [[TMP9]], i32 1) +// CHECK2-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]] +// CHECK2: .termination.notifier: +// CHECK2-NEXT: call void @__kmpc_kernel_deinit(i16 1) +// CHECK2-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK2-NEXT: br label [[DOTEXIT]] +// CHECK2: .exit: +// CHECK2-NEXT: ret void +// +// +// CHECK2-LABEL: define {{[^@]+}}@_Z3usePi +// CHECK2-SAME: (i32* [[C:%.*]]) #[[ATTR3:[0-9]+]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[C_ADDR:%.*]] = alloca i32*, align 4 +// CHECK2-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [1 x i8*], align 4 +// CHECK2-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) +// CHECK2-NEXT: store i32* [[C]], i32** [[C_ADDR]], align 4 +// CHECK2-NEXT: call void @__kmpc_push_num_threads(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]], i32 2) +// CHECK2-NEXT: [[TMP1:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 +// CHECK2-NEXT: [[TMP2:%.*]] = bitcast i32** [[C_ADDR]] to i8* +// CHECK2-NEXT: 
store i8* [[TMP2]], i8** [[TMP1]], align 4 +// CHECK2-NEXT: [[TMP3:%.*]] = bitcast [1 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK2-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32**)* @__omp_outlined__ to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined___wrapper to i8*), i8** [[TMP3]], i32 1) +// CHECK2-NEXT: ret void +// +// +// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__ +// CHECK2-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32** nonnull align 4 dereferenceable(4) [[C:%.*]]) #[[ATTR1]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK2-NEXT: [[C_ADDR:%.*]] = alloca i32**, align 4 +// CHECK2-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK2-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK2-NEXT: store i32** [[C]], i32*** [[C_ADDR]], align 4 +// CHECK2-NEXT: [[TMP0:%.*]] = load i32**, i32*** [[C_ADDR]], align 4 +// CHECK2-NEXT: [[TMP1:%.*]] = load i32*, i32** [[TMP0]], align 4 +// CHECK2-NEXT: call void @_Z4workPi(i32* [[TMP1]]) #[[ATTR7]] +// CHECK2-NEXT: ret void +// +// +// CHECK2-LABEL: define {{[^@]+}}@_Z4workPi +// CHECK2-SAME: (i32* [[C:%.*]]) #[[ATTR3]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[C_ADDR:%.*]] = alloca i32*, align 4 +// CHECK2-NEXT: [[ATOMIC_TEMP:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[ATOMIC_TEMP1:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: store i32* [[C]], i32** [[C_ADDR]], align 4 +// CHECK2-NEXT: [[TMP0:%.*]] = load i32*, i32** [[C_ADDR]], align 4 +// CHECK2-NEXT: [[TMP1:%.*]] = bitcast i32* [[TMP0]] to i8* +// CHECK2-NEXT: [[TMP2:%.*]] = bitcast i32* [[ATOMIC_TEMP]] to i8* +// CHECK2-NEXT: call void @__atomic_load(i32 4, i8* [[TMP1]], i8* [[TMP2]], i32 0) #[[ATTR7]] +// CHECK2-NEXT: br label [[ATOMIC_CONT:%.*]] +// CHECK2: atomic_cont: +// CHECK2-NEXT: [[TMP3:%.*]] = load i32, i32* [[ATOMIC_TEMP]], align 4 +// CHECK2-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP3]], 1 +// CHECK2-NEXT: store i32 [[ADD]], i32* [[ATOMIC_TEMP1]], align 4 +// CHECK2-NEXT: [[TMP4:%.*]] = bitcast i32* [[TMP0]] to i8* +// CHECK2-NEXT: [[TMP5:%.*]] = bitcast i32* [[ATOMIC_TEMP]] to i8* +// CHECK2-NEXT: [[TMP6:%.*]] = bitcast i32* [[ATOMIC_TEMP1]] to i8* +// CHECK2-NEXT: [[CALL:%.*]] = call zeroext i1 @__atomic_compare_exchange(i32 4, i8* [[TMP4]], i8* [[TMP5]], i8* [[TMP6]], i32 0, i32 0) #[[ATTR7]] +// CHECK2-NEXT: br i1 [[CALL]], label [[ATOMIC_EXIT:%.*]], label [[ATOMIC_CONT]] +// CHECK2: atomic_exit: +// CHECK2-NEXT: ret void +// +// +// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined___wrapper +// CHECK2-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 +// CHECK2-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 4 +// CHECK2-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK2-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 +// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK2-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) +// CHECK2-NEXT: [[TMP2:%.*]] = load i8**, i8*** [[GLOBAL_ARGS]], align 4 +// CHECK2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8*, i8** [[TMP2]], i32 0 +// CHECK2-NEXT: 
[[TMP4:%.*]] = bitcast i8** [[TMP3]] to i32*** +// CHECK2-NEXT: [[TMP5:%.*]] = load i32**, i32*** [[TMP4]], align 4 +// CHECK2-NEXT: call void @__omp_outlined__(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]], i32** [[TMP5]]) #[[ATTR5]] +// CHECK2-NEXT: ret void +// +// +// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__1 +// CHECK2-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[C:%.*]]) #[[ATTR1]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK2-NEXT: [[C_ADDR:%.*]] = alloca i32*, align 4 +// CHECK2-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK2-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK2-NEXT: store i32* [[C]], i32** [[C_ADDR]], align 4 +// CHECK2-NEXT: [[TMP0:%.*]] = load i32*, i32** [[C_ADDR]], align 4 +// CHECK2-NEXT: call void @_Z3usePi(i32* [[TMP0]]) #[[ATTR7]] +// CHECK2-NEXT: ret void +// +// +// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__1_wrapper +// CHECK2-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 +// CHECK2-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 4 +// CHECK2-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK2-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 +// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK2-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) +// CHECK2-NEXT: [[TMP2:%.*]] = load i8**, i8*** [[GLOBAL_ARGS]], align 4 +// CHECK2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8*, i8** [[TMP2]], i32 0 +// CHECK2-NEXT: [[TMP4:%.*]] = bitcast i8** [[TMP3]] to i32** +// CHECK2-NEXT: [[TMP5:%.*]] = load i32*, i32** [[TMP4]], align 4 +// CHECK2-NEXT: call void @__omp_outlined__1(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]], i32* [[TMP5]]) #[[ATTR5]] +// CHECK2-NEXT: ret void +// +// +// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l25_worker +// CHECK3-SAME: () #[[ATTR0:[0-9]+]] { +// CHECK3-NEXT: entry: +// CHECK3-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 4 +// CHECK3-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1 +// CHECK3-NEXT: store i8* null, i8** [[WORK_FN]], align 4 +// CHECK3-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1 +// CHECK3-NEXT: br label [[DOTAWAIT_WORK:%.*]] +// CHECK3: .await.work: +// CHECK3-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK3-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]]) +// CHECK3-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8 +// CHECK3-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1 +// CHECK3-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 4 +// CHECK3-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null +// CHECK3-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]] +// CHECK3: .select.workers: +// CHECK3-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1 +// CHECK3-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0 +// CHECK3-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]] +// CHECK3: .execute.parallel: +// CHECK3-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* 
@[[GLOB1:[0-9]+]]) +// CHECK3-NEXT: [[TMP5:%.*]] = load i8*, i8** [[WORK_FN]], align 4 +// CHECK3-NEXT: [[WORK_MATCH:%.*]] = icmp eq i8* [[TMP5]], bitcast (void (i16, i32)* @__omp_outlined___wrapper to i8*) +// CHECK3-NEXT: br i1 [[WORK_MATCH]], label [[DOTEXECUTE_FN:%.*]], label [[DOTCHECK_NEXT:%.*]] +// CHECK3: .execute.fn: +// CHECK3-NEXT: call void @__omp_outlined___wrapper(i16 0, i32 [[TMP4]]) #[[ATTR5:[0-9]+]] +// CHECK3-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]] +// CHECK3: .check.next: +// CHECK3-NEXT: [[TMP6:%.*]] = load i8*, i8** [[WORK_FN]], align 4 +// CHECK3-NEXT: [[WORK_MATCH1:%.*]] = icmp eq i8* [[TMP6]], bitcast (void (i16, i32)* @__omp_outlined__1_wrapper to i8*) +// CHECK3-NEXT: br i1 [[WORK_MATCH1]], label [[DOTEXECUTE_FN2:%.*]], label [[DOTCHECK_NEXT3:%.*]] +// CHECK3: .execute.fn2: +// CHECK3-NEXT: call void @__omp_outlined__1_wrapper(i16 0, i32 [[TMP4]]) #[[ATTR5]] +// CHECK3-NEXT: br label [[DOTTERMINATE_PARALLEL]] +// CHECK3: .check.next3: +// CHECK3-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)* +// CHECK3-NEXT: call void [[TMP7]](i16 0, i32 [[TMP4]]) +// CHECK3-NEXT: br label [[DOTTERMINATE_PARALLEL]] +// CHECK3: .terminate.parallel: +// CHECK3-NEXT: call void @__kmpc_kernel_end_parallel() +// CHECK3-NEXT: br label [[DOTBARRIER_PARALLEL]] +// CHECK3: .barrier.parallel: +// CHECK3-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK3-NEXT: br label [[DOTAWAIT_WORK]] +// CHECK3: .exit: +// CHECK3-NEXT: ret void +// +// +// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l25 +// CHECK3-SAME: (i32* nonnull align 4 dereferenceable(4) [[C:%.*]]) #[[ATTR1:[0-9]+]] { +// CHECK3-NEXT: entry: +// CHECK3-NEXT: [[C_ADDR:%.*]] = alloca i32*, align 4 +// CHECK3-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [1 x i8*], align 4 +// CHECK3-NEXT: store i32* [[C]], i32** [[C_ADDR]], align 4 +// CHECK3-NEXT: [[TMP0:%.*]] = load i32*, i32** [[C_ADDR]], align 4 +// CHECK3-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK3-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK3-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK3-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]] +// CHECK3-NEXT: [[TMP1:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]] +// CHECK3-NEXT: br i1 [[TMP1]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]] +// CHECK3: .worker: +// CHECK3-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l25_worker() #[[ATTR5]] +// CHECK3-NEXT: br label [[DOTEXIT:%.*]] +// CHECK3: .mastercheck: +// CHECK3-NEXT: [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK3-NEXT: [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK3-NEXT: [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK3-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1 +// CHECK3-NEXT: [[TMP3:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1 +// CHECK3-NEXT: [[TMP4:%.*]] = xor i32 [[TMP2]], -1 +// CHECK3-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP3]], [[TMP4]] +// CHECK3-NEXT: [[TMP5:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]] +// CHECK3-NEXT: br i1 [[TMP5]], label [[DOTMASTER:%.*]], label [[DOTEXIT]] +// CHECK3: .master: +// CHECK3-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK3-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK3-NEXT: 
[[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]] +// CHECK3-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1) +// CHECK3-NEXT: call void @__kmpc_data_sharing_init_stack() +// CHECK3-NEXT: [[TMP6:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) +// CHECK3-NEXT: call void @_Z3usePi(i32* [[TMP0]]) #[[ATTR7:[0-9]+]] +// CHECK3-NEXT: call void @__kmpc_push_num_threads(%struct.ident_t* @[[GLOB1]], i32 [[TMP6]], i32 2) +// CHECK3-NEXT: [[TMP7:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 +// CHECK3-NEXT: [[TMP8:%.*]] = bitcast i32* [[TMP0]] to i8* +// CHECK3-NEXT: store i8* [[TMP8]], i8** [[TMP7]], align 4 +// CHECK3-NEXT: [[TMP9:%.*]] = bitcast [1 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK3-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP6]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32*)* @__omp_outlined__1 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__1_wrapper to i8*), i8** [[TMP9]], i32 1) +// CHECK3-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]] +// CHECK3: .termination.notifier: +// CHECK3-NEXT: call void @__kmpc_kernel_deinit(i16 1) +// CHECK3-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK3-NEXT: br label [[DOTEXIT]] +// CHECK3: .exit: +// CHECK3-NEXT: ret void +// +// +// CHECK3-LABEL: define {{[^@]+}}@_Z3usePi +// CHECK3-SAME: (i32* [[C:%.*]]) #[[ATTR3:[0-9]+]] { +// CHECK3-NEXT: entry: +// CHECK3-NEXT: [[C_ADDR:%.*]] = alloca i32*, align 4 +// CHECK3-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [1 x i8*], align 4 +// CHECK3-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) +// CHECK3-NEXT: store i32* [[C]], i32** [[C_ADDR]], align 4 +// CHECK3-NEXT: call void @__kmpc_push_num_threads(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]], i32 2) +// CHECK3-NEXT: [[TMP1:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 +// CHECK3-NEXT: [[TMP2:%.*]] = bitcast i32** [[C_ADDR]] to i8* +// CHECK3-NEXT: store i8* [[TMP2]], i8** [[TMP1]], align 4 +// CHECK3-NEXT: [[TMP3:%.*]] = bitcast [1 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK3-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32**)* @__omp_outlined__ to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined___wrapper to i8*), i8** [[TMP3]], i32 1) +// CHECK3-NEXT: ret void +// +// +// CHECK3-LABEL: define {{[^@]+}}@__omp_outlined__ +// CHECK3-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32** nonnull align 4 dereferenceable(4) [[C:%.*]]) #[[ATTR1]] { +// CHECK3-NEXT: entry: +// CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK3-NEXT: [[C_ADDR:%.*]] = alloca i32**, align 4 +// CHECK3-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK3-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK3-NEXT: store i32** [[C]], i32*** [[C_ADDR]], align 4 +// CHECK3-NEXT: [[TMP0:%.*]] = load i32**, i32*** [[C_ADDR]], align 4 +// CHECK3-NEXT: [[TMP1:%.*]] = load i32*, i32** [[TMP0]], align 4 +// CHECK3-NEXT: call void @_Z4workPi(i32* [[TMP1]]) #[[ATTR7]] +// CHECK3-NEXT: ret void +// +// +// CHECK3-LABEL: define {{[^@]+}}@_Z4workPi +// CHECK3-SAME: (i32* [[C:%.*]]) #[[ATTR3]] { +// CHECK3-NEXT: entry: +// 
CHECK3-NEXT: [[C_ADDR:%.*]] = alloca i32*, align 4 +// CHECK3-NEXT: [[ATOMIC_TEMP:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[ATOMIC_TEMP1:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: store i32* [[C]], i32** [[C_ADDR]], align 4 +// CHECK3-NEXT: [[TMP0:%.*]] = load i32*, i32** [[C_ADDR]], align 4 +// CHECK3-NEXT: [[TMP1:%.*]] = bitcast i32* [[TMP0]] to i8* +// CHECK3-NEXT: [[TMP2:%.*]] = bitcast i32* [[ATOMIC_TEMP]] to i8* +// CHECK3-NEXT: call void @__atomic_load(i32 4, i8* [[TMP1]], i8* [[TMP2]], i32 0) #[[ATTR7]] +// CHECK3-NEXT: br label [[ATOMIC_CONT:%.*]] +// CHECK3: atomic_cont: +// CHECK3-NEXT: [[TMP3:%.*]] = load i32, i32* [[ATOMIC_TEMP]], align 4 +// CHECK3-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP3]], 1 +// CHECK3-NEXT: store i32 [[ADD]], i32* [[ATOMIC_TEMP1]], align 4 +// CHECK3-NEXT: [[TMP4:%.*]] = bitcast i32* [[TMP0]] to i8* +// CHECK3-NEXT: [[TMP5:%.*]] = bitcast i32* [[ATOMIC_TEMP]] to i8* +// CHECK3-NEXT: [[TMP6:%.*]] = bitcast i32* [[ATOMIC_TEMP1]] to i8* +// CHECK3-NEXT: [[CALL:%.*]] = call zeroext i1 @__atomic_compare_exchange(i32 4, i8* [[TMP4]], i8* [[TMP5]], i8* [[TMP6]], i32 0, i32 0) #[[ATTR7]] +// CHECK3-NEXT: br i1 [[CALL]], label [[ATOMIC_EXIT:%.*]], label [[ATOMIC_CONT]] +// CHECK3: atomic_exit: +// CHECK3-NEXT: ret void +// +// +// CHECK3-LABEL: define {{[^@]+}}@__omp_outlined___wrapper +// CHECK3-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { +// CHECK3-NEXT: entry: +// CHECK3-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 +// CHECK3-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 4 +// CHECK3-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK3-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 +// CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK3-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) +// CHECK3-NEXT: [[TMP2:%.*]] = load i8**, i8*** [[GLOBAL_ARGS]], align 4 +// CHECK3-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8*, i8** [[TMP2]], i32 0 +// CHECK3-NEXT: [[TMP4:%.*]] = bitcast i8** [[TMP3]] to i32*** +// CHECK3-NEXT: [[TMP5:%.*]] = load i32**, i32*** [[TMP4]], align 4 +// CHECK3-NEXT: call void @__omp_outlined__(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]], i32** [[TMP5]]) #[[ATTR5]] +// CHECK3-NEXT: ret void +// +// +// CHECK3-LABEL: define {{[^@]+}}@__omp_outlined__1 +// CHECK3-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[C:%.*]]) #[[ATTR1]] { +// CHECK3-NEXT: entry: +// CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK3-NEXT: [[C_ADDR:%.*]] = alloca i32*, align 4 +// CHECK3-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK3-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK3-NEXT: store i32* [[C]], i32** [[C_ADDR]], align 4 +// CHECK3-NEXT: [[TMP0:%.*]] = load i32*, i32** [[C_ADDR]], align 4 +// CHECK3-NEXT: call void @_Z3usePi(i32* [[TMP0]]) #[[ATTR7]] +// CHECK3-NEXT: ret void +// +// +// CHECK3-LABEL: define {{[^@]+}}@__omp_outlined__1_wrapper +// CHECK3-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { +// CHECK3-NEXT: entry: +// CHECK3-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 +// CHECK3-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: 
[[GLOBAL_ARGS:%.*]] = alloca i8**, align 4 +// CHECK3-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK3-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 +// CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK3-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) +// CHECK3-NEXT: [[TMP2:%.*]] = load i8**, i8*** [[GLOBAL_ARGS]], align 4 +// CHECK3-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8*, i8** [[TMP2]], i32 0 +// CHECK3-NEXT: [[TMP4:%.*]] = bitcast i8** [[TMP3]] to i32** +// CHECK3-NEXT: [[TMP5:%.*]] = load i32*, i32** [[TMP4]], align 4 +// CHECK3-NEXT: call void @__omp_outlined__1(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]], i32* [[TMP5]]) #[[ATTR5]] +// CHECK3-NEXT: ret void +// diff --git a/clang/test/OpenMP/nvptx_parallel_codegen.cpp b/clang/test/OpenMP/nvptx_parallel_codegen.cpp --- a/clang/test/OpenMP/nvptx_parallel_codegen.cpp +++ b/clang/test/OpenMP/nvptx_parallel_codegen.cpp @@ -1,12 +1,12 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature --include-generated-funcs --replace-function-regex "__omp_offloading_[0-9a-z]+_[0-9a-z]+" // Test target codegen - host bc file has to be created first. // RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc -// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64 --check-prefix SEQ -// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -disable-llvm-optzns -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64 --check-prefix PAR +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -aux-triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix CHECK1 +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -aux-triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -disable-llvm-optzns -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix CHECK2 // RUN: %clang_cc1 -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm-bc %s -o %t-x86-host.bc -// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 --check-prefix SEQ -// RUN: %clang_cc1 -verify -fopenmp -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -disable-llvm-optzns -disable-O0-optnone | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 --check-prefix SEQ -// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device 
-fopenmp-host-ir-file-path %t-x86-host.bc -o - -disable-llvm-optzns -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 --check-prefix PAR -// RUN: %clang_cc1 -verify -fopenmp -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -disable-llvm-optzns -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 --check-prefix PAR +// RUN: %clang_cc1 -verify -fopenmp -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -aux-triple i386-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -disable-llvm-optzns -disable-O0-optnone | FileCheck %s --check-prefix CHECK4 +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx-unknown-unknown -aux-triple i386-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -disable-llvm-optzns -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix CHECK5 +// RUN: %clang_cc1 -verify -fopenmp -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -aux-triple i386-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -disable-llvm-optzns -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix CHECK6 // expected-no-diagnostics #ifndef HEADER #define HEADER @@ -75,311 +75,2446 @@ return a; } -// SEQ: [[MEM_TY:%.+]] = type { [128 x i8] } -// SEQ-DAG: [[SHARED_GLOBAL_RD:@.+]] = weak addrspace(3) global [[MEM_TY]] undef -// SEQ-DAG: [[KERNEL_PTR:@.+]] = internal addrspace(3) global i8* undef -// SEQ-DAG: [[KERNEL_SIZE:@.+]] = internal unnamed_addr constant i{{64|32}} 4 -// SEQ-DAG: [[KERNEL_SHARED:@.+]] = internal unnamed_addr constant i16 1 - -// CHECK-NOT: define {{.*}}void {{@__omp_offloading_.+template.+l20}}_worker() - -// CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l29}}_worker() -// CHECK-DAG: [[OMP_EXEC_STATUS:%.+]] = alloca i8, -// CHECK-DAG: [[OMP_WORK_FN:%.+]] = alloca i8*, -// CHECK: store i8* null, i8** [[OMP_WORK_FN]], -// CHECK: store i8 0, i8* [[OMP_EXEC_STATUS]], -// CHECK: br label {{%?}}[[AWAIT_WORK:.+]] -// -// CHECK: [[AWAIT_WORK]] -// CHECK: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) -// CHECK: [[KPR:%.+]] = call i1 @__kmpc_kernel_parallel(i8** [[OMP_WORK_FN]]) -// CHECK: [[KPRB:%.+]] = zext i1 [[KPR]] to i8 -// store i8 [[KPRB]], i8* [[OMP_EXEC_STATUS]], align 1 -// CHECK: [[WORK:%.+]] = load i8*, i8** [[OMP_WORK_FN]], -// CHECK: [[SHOULD_EXIT:%.+]] = icmp eq i8* [[WORK]], null -// CHECK: br i1 [[SHOULD_EXIT]], label {{%?}}[[EXIT:.+]], label {{%?}}[[SEL_WORKERS:.+]] -// -// CHECK: [[SEL_WORKERS]] -// CHECK: [[ST:%.+]] = load i8, i8* [[OMP_EXEC_STATUS]] -// CHECK: [[IS_ACTIVE:%.+]] = icmp ne i8 [[ST]], 0 -// CHECK: br i1 [[IS_ACTIVE]], label {{%?}}[[EXEC_PARALLEL:.+]], label {{%?}}[[BAR_PARALLEL:.+]] -// -// CHECK: [[EXEC_PARALLEL]] -// CHECK: [[WF1:%.+]] = load i8*, i8** [[OMP_WORK_FN]], -// CHECK: [[WM1:%.+]] = icmp eq i8* [[WF1]], bitcast (void (i16, i32)* [[PARALLEL_FN1:@.+]]_wrapper to i8*) -// CHECK: br i1 [[WM1]], label {{%?}}[[EXEC_PFN1:.+]], label {{%?}}[[CHECK_NEXT1:.+]] -// -// CHECK: [[EXEC_PFN1]] -// CHECK: call void [[PARALLEL_FN1]]_wrapper( -// CHECK: br label {{%?}}[[TERM_PARALLEL:.+]] -// -// CHECK: 
[[CHECK_NEXT1]] -// CHECK: [[WF2:%.+]] = load i8*, i8** [[OMP_WORK_FN]], -// CHECK: [[WM2:%.+]] = icmp eq i8* [[WF2]], bitcast (void (i16, i32)* [[PARALLEL_FN2:@.+]]_wrapper to i8*) -// CHECK: br i1 [[WM2]], label {{%?}}[[EXEC_PFN2:.+]], label {{%?}}[[CHECK_NEXT2:.+]] -// -// CHECK: [[EXEC_PFN2]] -// CHECK: call void [[PARALLEL_FN2]]_wrapper( -// CHECK: br label {{%?}}[[TERM_PARALLEL:.+]] -// -// CHECK: [[CHECK_NEXT2]] -// CHECK: br label {{%?}}[[TERM_PARALLEL:.+]] -// -// CHECK: [[TERM_PARALLEL]] -// CHECK: call void @__kmpc_kernel_end_parallel() -// CHECK: br label {{%?}}[[BAR_PARALLEL]] -// -// CHECK: [[BAR_PARALLEL]] -// CHECK: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) -// CHECK: br label {{%?}}[[AWAIT_WORK]] -// -// CHECK: [[EXIT]] -// CHECK: ret void - -// CHECK: define {{.*}}void [[T6:@__omp_offloading_.+template.+l29]](i[[SZ:32|64]] -// Create local storage for each capture. -// CHECK: [[LOCAL_A:%.+]] = alloca i[[SZ]], -// CHECK-DAG: store i[[SZ]] [[ARG_A:%.+]], i[[SZ]]* [[LOCAL_A]] -// Store captures in the context. -// CHECK-64-DAG:[[REF_A:%.+]] = bitcast i[[SZ]]* [[LOCAL_A]] to i32* -// -// CHECK-DAG: [[TID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK-DAG: [[NTH:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK-DAG: [[WS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK-DAG: [[TH_LIMIT:%.+]] = sub nuw i32 [[NTH]], [[WS]] -// CHECK: [[IS_WORKER:%.+]] = icmp ult i32 [[TID]], [[TH_LIMIT]] -// CHECK: br i1 [[IS_WORKER]], label {{%?}}[[WORKER:.+]], label {{%?}}[[CHECK_MASTER:.+]] -// -// CHECK: [[WORKER]] -// CHECK: {{call|invoke}} void [[T6]]_worker() -// CHECK: br label {{%?}}[[EXIT:.+]] -// -// CHECK: [[CHECK_MASTER]] -// CHECK-DAG: [[CMTID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK-DAG: [[CMNTH:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK-DAG: [[CMWS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK: [[IS_MASTER:%.+]] = icmp eq i32 [[CMTID]], -// CHECK: br i1 [[IS_MASTER]], label {{%?}}[[MASTER:.+]], label {{%?}}[[EXIT]] -// -// CHECK: [[MASTER]] -// CHECK-DAG: [[MNTH:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK-DAG: [[MWS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK: [[MTMP1:%.+]] = sub nuw i32 [[MNTH]], [[MWS]] -// CHECK: call void @__kmpc_kernel_init(i32 [[MTMP1]] -// CHECK: call void @__kmpc_kernel_prepare_parallel(i8* bitcast (void (i16, i32)* [[PARALLEL_FN1]]_wrapper to i8*)) -// CHECK: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) -// CHECK: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) -// CHECK: call void @__kmpc_serialized_parallel( -// CHECK: {{call|invoke}} void [[PARALLEL_FN3:@.+]]( -// CHECK: call void @__kmpc_end_serialized_parallel( -// CHECK: call void @__kmpc_kernel_prepare_parallel(i8* bitcast (void (i16, i32)* [[PARALLEL_FN2]]_wrapper to i8*)) -// CHECK: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) -// CHECK: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) -// CHECK-64-DAG: load i32, i32* [[REF_A]] -// CHECK-32-DAG: load i32, i32* [[LOCAL_A]] -// CHECK: br label {{%?}}[[TERMINATE:.+]] -// -// CHECK: [[TERMINATE]] -// CHECK: call void @__kmpc_kernel_deinit( -// CHECK: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) -// CHECK: br label {{%?}}[[EXIT]] -// -// CHECK: [[EXIT]] -// CHECK: ret void - -// CHECK-DAG: define internal void [[PARALLEL_FN1]]( -// CHECK: [[A:%.+]] = alloca i[[SZ:32|64]], -// CHECK: store 
i[[SZ]] 42, i[[SZ]]* %a, -// CHECK: ret void - -// CHECK-DAG: define internal void [[PARALLEL_FN3]]( -// CHECK: [[A:%.+]] = alloca i[[SZ:32|64]], -// CHECK: store i[[SZ]] 43, i[[SZ]]* %a, -// CHECK: ret void - -// CHECK-DAG: define internal void [[PARALLEL_FN2]]( -// CHECK: [[A:%.+]] = alloca i[[SZ:32|64]], -// CHECK: store i[[SZ]] 44, i[[SZ]]* %a, -// CHECK: ret void - -// CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l46}}_worker() -// CHECK-DAG: [[OMP_EXEC_STATUS:%.+]] = alloca i8, -// CHECK-DAG: [[OMP_WORK_FN:%.+]] = alloca i8*, -// CHECK: store i8* null, i8** [[OMP_WORK_FN]], -// CHECK: store i8 0, i8* [[OMP_EXEC_STATUS]], -// CHECK: br label {{%?}}[[AWAIT_WORK:.+]] -// -// CHECK: [[AWAIT_WORK]] -// CHECK: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) -// CHECK: [[KPR:%.+]] = call i1 @__kmpc_kernel_parallel(i8** [[OMP_WORK_FN]]) -// CHECK: [[KPRB:%.+]] = zext i1 [[KPR]] to i8 -// store i8 [[KPRB]], i8* [[OMP_EXEC_STATUS]], align 1 -// CHECK: [[WORK:%.+]] = load i8*, i8** [[OMP_WORK_FN]], -// CHECK: [[SHOULD_EXIT:%.+]] = icmp eq i8* [[WORK]], null -// CHECK: br i1 [[SHOULD_EXIT]], label {{%?}}[[EXIT:.+]], label {{%?}}[[SEL_WORKERS:.+]] -// -// CHECK: [[SEL_WORKERS]] -// CHECK: [[ST:%.+]] = load i8, i8* [[OMP_EXEC_STATUS]] -// CHECK: [[IS_ACTIVE:%.+]] = icmp ne i8 [[ST]], 0 -// CHECK: br i1 [[IS_ACTIVE]], label {{%?}}[[EXEC_PARALLEL:.+]], label {{%?}}[[BAR_PARALLEL:.+]] -// -// CHECK: [[EXEC_PARALLEL]] -// CHECK: [[WF:%.+]] = load i8*, i8** [[OMP_WORK_FN]], -// CHECK: [[WM:%.+]] = icmp eq i8* [[WF]], bitcast (void (i16, i32)* [[PARALLEL_FN4:@.+]]_wrapper to i8*) -// CHECK: br i1 [[WM]], label {{%?}}[[EXEC_PFN:.+]], label {{%?}}[[CHECK_NEXT:.+]] -// -// CHECK: [[EXEC_PFN]] -// CHECK: call void [[PARALLEL_FN4]]_wrapper( -// CHECK: br label {{%?}}[[TERM_PARALLEL:.+]] -// -// CHECK: [[CHECK_NEXT]] -// CHECK: br label {{%?}}[[TERM_PARALLEL:.+]] -// -// CHECK: [[TERM_PARALLEL]] -// CHECK: call void @__kmpc_kernel_end_parallel() -// CHECK: br label {{%?}}[[BAR_PARALLEL]] -// -// CHECK: [[BAR_PARALLEL]] -// CHECK: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) -// CHECK: br label {{%?}}[[AWAIT_WORK]] -// -// CHECK: [[EXIT]] -// CHECK: ret void - -// CHECK: define {{.*}}void [[T6:@__omp_offloading_.+template.+l46]](i[[SZ:32|64]] -// Create local storage for each capture. -// CHECK: [[LOCAL_N:%.+]] = alloca i[[SZ]], -// CHECK: [[LOCAL_A:%.+]] = alloca i[[SZ]], -// CHECK: [[LOCAL_AA:%.+]] = alloca i[[SZ]], -// CHECK: [[LOCAL_B:%.+]] = alloca [10 x i32]* -// CHECK-DAG: store i[[SZ]] [[ARG_N:%.+]], i[[SZ]]* [[LOCAL_N]] -// CHECK-DAG: store i[[SZ]] [[ARG_A:%.+]], i[[SZ]]* [[LOCAL_A]] -// CHECK-DAG: store i[[SZ]] [[ARG_AA:%.+]], i[[SZ]]* [[LOCAL_AA]] -// CHECK-DAG: store [10 x i32]* [[ARG_B:%.+]], [10 x i32]** [[LOCAL_B]] -// Store captures in the context. 
-// CHECK-64-DAG:[[REF_N:%.+]] = bitcast i[[SZ]]* [[LOCAL_N]] to i32* -// CHECK-64-DAG:[[REF_A:%.+]] = bitcast i[[SZ]]* [[LOCAL_A]] to i32* -// CHECK-DAG: [[REF_AA:%.+]] = bitcast i[[SZ]]* [[LOCAL_AA]] to i16* -// CHECK-DAG: [[REF_B:%.+]] = load [10 x i32]*, [10 x i32]** [[LOCAL_B]], -// -// CHECK-DAG: [[TID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK-DAG: [[NTH:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK-DAG: [[WS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK-DAG: [[TH_LIMIT:%.+]] = sub nuw i32 [[NTH]], [[WS]] -// CHECK: [[IS_WORKER:%.+]] = icmp ult i32 [[TID]], [[TH_LIMIT]] -// CHECK: br i1 [[IS_WORKER]], label {{%?}}[[WORKER:.+]], label {{%?}}[[CHECK_MASTER:.+]] -// -// CHECK: [[WORKER]] -// CHECK: {{call|invoke}} void [[T6]]_worker() -// CHECK: br label {{%?}}[[EXIT:.+]] -// -// CHECK: [[CHECK_MASTER]] -// CHECK-DAG: [[CMTID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK-DAG: [[CMNTH:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK-DAG: [[CMWS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK: [[IS_MASTER:%.+]] = icmp eq i32 [[CMTID]], -// CHECK: br i1 [[IS_MASTER]], label {{%?}}[[MASTER:.+]], label {{%?}}[[EXIT]] -// -// CHECK: [[MASTER]] -// CHECK-DAG: [[MNTH:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK-DAG: [[MWS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK: [[MTMP1:%.+]] = sub nuw i32 [[MNTH]], [[MWS]] -// CHECK: call void @__kmpc_kernel_init(i32 [[MTMP1]] -// CHECK-64: [[N:%.+]] = load i32, i32* [[REF_N]], -// CHECK-32: [[N:%.+]] = load i32, i32* [[LOCAL_N]], -// CHECK: [[CMP:%.+]] = icmp sgt i32 [[N]], 1000 -// CHECK: br i1 [[CMP]], label {{%?}}[[IF_THEN:.+]], label {{%?}}[[IF_ELSE:.+]] -// -// CHECK: [[IF_THEN]] -// CHECK: call void @__kmpc_kernel_prepare_parallel(i8* bitcast (void (i16, i32)* [[PARALLEL_FN4]]_wrapper to i8*)) -// CHECK: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) -// CHECK: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) -// CHECK: br label {{%?}}[[IF_END:.+]] -// -// CHECK: [[IF_ELSE]] -// CHECK: call void @__kmpc_serialized_parallel( -// CHECK: {{call|invoke}} void [[PARALLEL_FN4]]( -// CHECK: call void @__kmpc_end_serialized_parallel( -// br label [[IF_END]] -// -// CHECK: [[IF_END]] -// CHECK-64-DAG: load i32, i32* [[REF_A]] -// CHECK-32-DAG: load i32, i32* [[LOCAL_A]] -// CHECK-DAG: load i16, i16* [[REF_AA]] -// CHECK-DAG: getelementptr inbounds [10 x i32], [10 x i32]* [[REF_B]], i[[SZ]] 0, i[[SZ]] 2 -// -// CHECK: br label {{%?}}[[TERMINATE:.+]] -// -// CHECK: [[TERMINATE]] -// CHECK: call void @__kmpc_kernel_deinit( -// CHECK: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) -// CHECK: br label {{%?}}[[EXIT]] -// -// CHECK: [[EXIT]] -// CHECK: ret void - -// CHECK: noinline -// CHECK-NEXT: define internal void [[PARALLEL_FN4]]( -// CHECK: [[A:%.+]] = alloca i[[SZ:32|64]], -// CHECK: store i[[SZ]] 45, i[[SZ]]* %a, -// CHECK: call void @__kmpc_barrier(%struct.ident_t* @{{.+}}, i32 %{{.+}}) -// CHECK: ret void - -// CHECK: declare void @__kmpc_barrier(%struct.ident_t*, i32) #[[#CONVERGENT:]] - -// CHECK: Function Attrs: convergent noinline norecurse nounwind -// CHECK-NEXT: [[PARALLEL_FN4]]_wrapper - -// CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l58}}_worker() -// CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l58}}( -// CHECK-32: [[A_ADDR:%.+]] = alloca i32, -// CHECK-64: [[A_ADDR:%.+]] = alloca i64, -// CHECK-64: [[CONV:%.+]] = bitcast 
i64* [[A_ADDR]] to i32* -// SEQ: [[IS_SHARED:%.+]] = load i16, i16* [[KERNEL_SHARED]], -// SEQ: [[SIZE:%.+]] = load i{{64|32}}, i{{64|32}}* [[KERNEL_SIZE]], -// SEQ: call void @__kmpc_get_team_static_memory(i16 0, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds ([[MEM_TY]], [[MEM_TY]] addrspace(3)* [[SHARED_GLOBAL_RD]], i32 0, i32 0, i32 0) to i8*), i{{64|32}} [[SIZE]], i16 [[IS_SHARED]], i8** addrspacecast (i8* addrspace(3)* [[KERNEL_PTR]] to i8**)) -// SEQ: [[KERNEL_RD:%.+]] = load i8*, i8* addrspace(3)* [[KERNEL_PTR]], -// SEQ: [[STACK:%.+]] = getelementptr inbounds i8, i8* [[KERNEL_RD]], i{{64|32}} 0 -// PAR: [[STACK:%.+]] = call i8* @__kmpc_data_sharing_push_stack(i{{32|64}} 4, i16 1) -// CHECK: [[BC:%.+]] = bitcast i8* [[STACK]] to %struct._globalized_locals_ty* -// CHECK-32: [[A:%.+]] = load i32, i32* [[A_ADDR]], -// CHECK-64: [[A:%.+]] = load i32, i32* [[CONV]], -// CHECK: [[GLOBAL_A_ADDR:%.+]] = getelementptr inbounds %struct._globalized_locals_ty, %struct._globalized_locals_ty* [[BC]], i{{[0-9]+}} 0, i{{[0-9]+}} 0 -// CHECK: store i32 [[A]], i32* [[GLOBAL_A_ADDR]], -// SEQ: [[IS_SHARED:%.+]] = load i16, i16* [[KERNEL_SHARED]], -// SEQ: call void @__kmpc_restore_team_static_memory(i16 0, i16 [[IS_SHARED]]) -// PAR: call void @__kmpc_data_sharing_pop_stack(i8* [[STACK]]) - -// CHECK-LABEL: define internal void @{{.+}}(i32* noalias %{{.+}}, i32* noalias %{{.+}}, i32* nonnull align {{[0-9]+}} dereferenceable{{.*}}) -// CHECK: [[CC:%.+]] = alloca i32, -// CHECK: [[MASK:%.+]] = call i32 @__kmpc_warp_active_thread_mask(){{$}} -// CHECK: [[TID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK: [[NUM_THREADS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK: store i32 0, i32* [[CC]], -// CHECK: br label - -// CHECK: [[CC_VAL:%.+]] = load i32, i32* [[CC]], -// CHECK: [[RES:%.+]] = icmp slt i32 [[CC_VAL]], [[NUM_THREADS]] -// CHECK: br i1 [[RES]], label - -// CHECK: [[CC_VAL:%.+]] = load i32, i32* [[CC]], -// CHECK: [[RES:%.+]] = icmp eq i32 [[TID]], [[CC_VAL]] -// CHECK: br i1 [[RES]], label - -// CHECK: call void @__kmpc_critical( -// CHECK: load i32, i32* -// CHECK: add nsw i32 -// CHECK: store i32 -// CHECK: call void @__kmpc_end_critical( - -// CHECK: call void @__kmpc_syncwarp(i32 [[MASK]]){{$}} -// CHECK: [[NEW_CC_VAL:%.+]] = add nsw i32 [[CC_VAL]], 1 -// CHECK: store i32 [[NEW_CC_VAL]], i32* [[CC]], -// CHECK: br label - -// CHECK: declare i32 @__kmpc_warp_active_thread_mask() #[[#CONVERGENT:]] -// CHECK: declare void @__kmpc_syncwarp(i32) #[[#CONVERGENT:]] - -// CHECK: attributes #[[#CONVERGENT:]] = {{.*}} convergent {{.*}} - #endif +// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l29_worker +// CHECK1-SAME: () #[[ATTR0:[0-9]+]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 8 +// CHECK1-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1 +// CHECK1-NEXT: store i8* null, i8** [[WORK_FN]], align 8 +// CHECK1-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1 +// CHECK1-NEXT: br label [[DOTAWAIT_WORK:%.*]] +// CHECK1: .await.work: +// CHECK1-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK1-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]]) +// CHECK1-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8 +// CHECK1-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1 +// CHECK1-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 8 +// CHECK1-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null +// CHECK1-NEXT: br i1 
[[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]] +// CHECK1: .select.workers: +// CHECK1-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1 +// CHECK1-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0 +// CHECK1-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]] +// CHECK1: .execute.parallel: +// CHECK1-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]]) +// CHECK1-NEXT: [[TMP5:%.*]] = load i8*, i8** [[WORK_FN]], align 8 +// CHECK1-NEXT: [[WORK_MATCH:%.*]] = icmp eq i8* [[TMP5]], bitcast (void (i16, i32)* @__omp_outlined___wrapper to i8*) +// CHECK1-NEXT: br i1 [[WORK_MATCH]], label [[DOTEXECUTE_FN:%.*]], label [[DOTCHECK_NEXT:%.*]] +// CHECK1: .execute.fn: +// CHECK1-NEXT: call void @__omp_outlined___wrapper(i16 0, i32 [[TMP4]]) #[[ATTR3:[0-9]+]] +// CHECK1-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]] +// CHECK1: .check.next: +// CHECK1-NEXT: [[TMP6:%.*]] = load i8*, i8** [[WORK_FN]], align 8 +// CHECK1-NEXT: [[WORK_MATCH1:%.*]] = icmp eq i8* [[TMP6]], bitcast (void (i16, i32)* @__omp_outlined__1_wrapper to i8*) +// CHECK1-NEXT: br i1 [[WORK_MATCH1]], label [[DOTEXECUTE_FN2:%.*]], label [[DOTCHECK_NEXT3:%.*]] +// CHECK1: .execute.fn2: +// CHECK1-NEXT: call void @__omp_outlined__1_wrapper(i16 0, i32 [[TMP4]]) #[[ATTR3]] +// CHECK1-NEXT: br label [[DOTTERMINATE_PARALLEL]] +// CHECK1: .check.next3: +// CHECK1-NEXT: [[TMP7:%.*]] = load i8*, i8** [[WORK_FN]], align 8 +// CHECK1-NEXT: [[WORK_MATCH4:%.*]] = icmp eq i8* [[TMP7]], bitcast (void (i16, i32)* @__omp_outlined__2_wrapper to i8*) +// CHECK1-NEXT: br i1 [[WORK_MATCH4]], label [[DOTEXECUTE_FN5:%.*]], label [[DOTCHECK_NEXT6:%.*]] +// CHECK1: .execute.fn5: +// CHECK1-NEXT: call void @__omp_outlined__2_wrapper(i16 0, i32 [[TMP4]]) #[[ATTR3]] +// CHECK1-NEXT: br label [[DOTTERMINATE_PARALLEL]] +// CHECK1: .check.next6: +// CHECK1-NEXT: [[TMP8:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)* +// CHECK1-NEXT: call void [[TMP8]](i16 0, i32 [[TMP4]]) +// CHECK1-NEXT: br label [[DOTTERMINATE_PARALLEL]] +// CHECK1: .terminate.parallel: +// CHECK1-NEXT: call void @__kmpc_kernel_end_parallel() +// CHECK1-NEXT: br label [[DOTBARRIER_PARALLEL]] +// CHECK1: .barrier.parallel: +// CHECK1-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK1-NEXT: br label [[DOTAWAIT_WORK]] +// CHECK1: .exit: +// CHECK1-NEXT: ret void +// +// +// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l29 +// CHECK1-SAME: (i64 [[A:%.*]]) #[[ATTR1:[0-9]+]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[A_ADDR:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 8 +// CHECK1-NEXT: [[CAPTURED_VARS_ADDRS7:%.*]] = alloca [0 x i8*], align 8 +// CHECK1-NEXT: [[CAPTURED_VARS_ADDRS8:%.*]] = alloca [0 x i8*], align 8 +// CHECK1-NEXT: store i64 [[A]], i64* [[A_ADDR]], align 8 +// CHECK1-NEXT: [[CONV:%.*]] = bitcast i64* [[A_ADDR]] to i32* +// CHECK1-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK1-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK1-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK1-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]] +// CHECK1-NEXT: [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]] +// CHECK1-NEXT: br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]] +// CHECK1: 
.worker: +// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l29_worker() #[[ATTR3]] +// CHECK1-NEXT: br label [[DOTEXIT:%.*]] +// CHECK1: .mastercheck: +// CHECK1-NEXT: [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK1-NEXT: [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK1-NEXT: [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK1-NEXT: [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1 +// CHECK1-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1 +// CHECK1-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], -1 +// CHECK1-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP2]], [[TMP3]] +// CHECK1-NEXT: [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]] +// CHECK1-NEXT: br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]] +// CHECK1: .master: +// CHECK1-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK1-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK1-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]] +// CHECK1-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1) +// CHECK1-NEXT: call void @__kmpc_data_sharing_init_stack() +// CHECK1-NEXT: [[TMP5:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) +// CHECK1-NEXT: [[TMP6:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK1-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__ to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined___wrapper to i8*), i8** [[TMP6]], i64 0) +// CHECK1-NEXT: [[TMP7:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS7]] to i8** +// CHECK1-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i32 0, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__1 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__1_wrapper to i8*), i8** [[TMP7]], i64 0) +// CHECK1-NEXT: [[TMP8:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS8]] to i8** +// CHECK1-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__2 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__2_wrapper to i8*), i8** [[TMP8]], i64 0) +// CHECK1-NEXT: [[TMP9:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP9]], 1 +// CHECK1-NEXT: store i32 [[ADD]], i32* [[CONV]], align 8 +// CHECK1-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]] +// CHECK1: .termination.notifier: +// CHECK1-NEXT: call void @__kmpc_kernel_deinit(i16 1) +// CHECK1-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK1-NEXT: br label [[DOTEXIT]] +// CHECK1: .exit: +// CHECK1-NEXT: ret void +// +// +// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__ +// CHECK1-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK1-NEXT: [[A:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK1-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +// CHECK1-NEXT: store i32 42, i32* [[A]], align 4 +// CHECK1-NEXT: ret void +// +// +// 
CHECK1-LABEL: define {{[^@]+}}@__omp_outlined___wrapper +// CHECK1-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 +// CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 +// CHECK1-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK1-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 +// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK1-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) +// CHECK1-NEXT: call void @__omp_outlined__(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]] +// CHECK1-NEXT: ret void +// +// +// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__1 +// CHECK1-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK1-NEXT: [[A:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK1-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +// CHECK1-NEXT: store i32 43, i32* [[A]], align 4 +// CHECK1-NEXT: ret void +// +// +// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__1_wrapper +// CHECK1-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 +// CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 +// CHECK1-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK1-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 +// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK1-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) +// CHECK1-NEXT: call void @__omp_outlined__1(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]] +// CHECK1-NEXT: ret void +// +// +// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__2 +// CHECK1-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK1-NEXT: [[A:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK1-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +// CHECK1-NEXT: store i32 44, i32* [[A]], align 4 +// CHECK1-NEXT: ret void +// +// +// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__2_wrapper +// CHECK1-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 +// CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 +// CHECK1-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK1-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 +// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK1-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) +// CHECK1-NEXT: call void @__omp_outlined__2(i32* [[DOTADDR1]], i32* 
[[DOTZERO_ADDR]]) #[[ATTR3]] +// CHECK1-NEXT: ret void +// +// +// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l46_worker +// CHECK1-SAME: () #[[ATTR0]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 8 +// CHECK1-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1 +// CHECK1-NEXT: store i8* null, i8** [[WORK_FN]], align 8 +// CHECK1-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1 +// CHECK1-NEXT: br label [[DOTAWAIT_WORK:%.*]] +// CHECK1: .await.work: +// CHECK1-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK1-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]]) +// CHECK1-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8 +// CHECK1-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1 +// CHECK1-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 8 +// CHECK1-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null +// CHECK1-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]] +// CHECK1: .select.workers: +// CHECK1-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1 +// CHECK1-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0 +// CHECK1-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]] +// CHECK1: .execute.parallel: +// CHECK1-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) +// CHECK1-NEXT: [[TMP5:%.*]] = load i8*, i8** [[WORK_FN]], align 8 +// CHECK1-NEXT: [[WORK_MATCH:%.*]] = icmp eq i8* [[TMP5]], bitcast (void (i16, i32)* @__omp_outlined__3_wrapper to i8*) +// CHECK1-NEXT: br i1 [[WORK_MATCH]], label [[DOTEXECUTE_FN:%.*]], label [[DOTCHECK_NEXT:%.*]] +// CHECK1: .execute.fn: +// CHECK1-NEXT: call void @__omp_outlined__3_wrapper(i16 0, i32 [[TMP4]]) #[[ATTR3]] +// CHECK1-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]] +// CHECK1: .check.next: +// CHECK1-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)* +// CHECK1-NEXT: call void [[TMP6]](i16 0, i32 [[TMP4]]) +// CHECK1-NEXT: br label [[DOTTERMINATE_PARALLEL]] +// CHECK1: .terminate.parallel: +// CHECK1-NEXT: call void @__kmpc_kernel_end_parallel() +// CHECK1-NEXT: br label [[DOTBARRIER_PARALLEL]] +// CHECK1: .barrier.parallel: +// CHECK1-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK1-NEXT: br label [[DOTAWAIT_WORK]] +// CHECK1: .exit: +// CHECK1-NEXT: ret void +// +// +// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l46 +// CHECK1-SAME: (i64 [[N:%.*]], i64 [[A:%.*]], i64 [[AA:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR1]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[A_ADDR:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[AA_ADDR:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 8 +// CHECK1-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 8 +// CHECK1-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 +// CHECK1-NEXT: store i64 [[A]], i64* [[A_ADDR]], align 8 +// CHECK1-NEXT: store i64 [[AA]], i64* [[AA_ADDR]], align 8 +// CHECK1-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 8 +// CHECK1-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* +// CHECK1-NEXT: [[CONV1:%.*]] = bitcast i64* [[A_ADDR]] to i32* +// CHECK1-NEXT: [[CONV2:%.*]] = bitcast i64* [[AA_ADDR]] to i16* +// CHECK1-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 8 +// 
CHECK1-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK1-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK1-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK1-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]] +// CHECK1-NEXT: [[TMP1:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]] +// CHECK1-NEXT: br i1 [[TMP1]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]] +// CHECK1: .worker: +// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l46_worker() #[[ATTR3]] +// CHECK1-NEXT: br label [[DOTEXIT:%.*]] +// CHECK1: .mastercheck: +// CHECK1-NEXT: [[NVPTX_TID3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK1-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK1-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK1-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE5]], 1 +// CHECK1-NEXT: [[TMP3:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], 1 +// CHECK1-NEXT: [[TMP4:%.*]] = xor i32 [[TMP2]], -1 +// CHECK1-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP3]], [[TMP4]] +// CHECK1-NEXT: [[TMP5:%.*]] = icmp eq i32 [[NVPTX_TID3]], [[MASTER_TID]] +// CHECK1-NEXT: br i1 [[TMP5]], label [[DOTMASTER:%.*]], label [[DOTEXIT]] +// CHECK1: .master: +// CHECK1-NEXT: [[NVPTX_NUM_THREADS6:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK1-NEXT: [[NVPTX_WARP_SIZE7:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK1-NEXT: [[THREAD_LIMIT8:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS6]], [[NVPTX_WARP_SIZE7]] +// CHECK1-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT8]], i16 1) +// CHECK1-NEXT: call void @__kmpc_data_sharing_init_stack() +// CHECK1-NEXT: [[TMP6:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) +// CHECK1-NEXT: [[TMP7:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK1-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP7]], 1000 +// CHECK1-NEXT: [[TMP8:%.*]] = zext i1 [[CMP]] to i32 +// CHECK1-NEXT: [[TMP9:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK1-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP6]], i32 [[TMP8]], i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__3 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__3_wrapper to i8*), i8** [[TMP9]], i64 0) +// CHECK1-NEXT: [[TMP10:%.*]] = load i32, i32* [[CONV1]], align 8 +// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP10]], 1 +// CHECK1-NEXT: store i32 [[ADD]], i32* [[CONV1]], align 8 +// CHECK1-NEXT: [[TMP11:%.*]] = load i16, i16* [[CONV2]], align 8 +// CHECK1-NEXT: [[CONV9:%.*]] = sext i16 [[TMP11]] to i32 +// CHECK1-NEXT: [[ADD10:%.*]] = add nsw i32 [[CONV9]], 1 +// CHECK1-NEXT: [[CONV11:%.*]] = trunc i32 [[ADD10]] to i16 +// CHECK1-NEXT: store i16 [[CONV11]], i16* [[CONV2]], align 8 +// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP0]], i64 0, i64 2 +// CHECK1-NEXT: [[TMP12:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +// CHECK1-NEXT: [[ADD12:%.*]] = add nsw i32 [[TMP12]], 1 +// CHECK1-NEXT: store i32 [[ADD12]], i32* [[ARRAYIDX]], align 4 +// CHECK1-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]] +// CHECK1: .termination.notifier: +// CHECK1-NEXT: call void @__kmpc_kernel_deinit(i16 1) +// CHECK1-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK1-NEXT: br label [[DOTEXIT]] +// CHECK1: .exit: +// CHECK1-NEXT: ret void +// +// 
+// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__3 +// CHECK1-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK1-NEXT: [[A:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK1-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +// CHECK1-NEXT: store i32 45, i32* [[A]], align 4 +// CHECK1-NEXT: [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK1-NEXT: [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4 +// CHECK1-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP1]]) +// CHECK1-NEXT: ret void +// +// +// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__3_wrapper +// CHECK1-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 +// CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 +// CHECK1-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK1-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 +// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK1-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) +// CHECK1-NEXT: call void @__omp_outlined__3(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]] +// CHECK1-NEXT: ret void +// +// +// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l58_worker +// CHECK1-SAME: () #[[ATTR0]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 8 +// CHECK1-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1 +// CHECK1-NEXT: store i8* null, i8** [[WORK_FN]], align 8 +// CHECK1-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1 +// CHECK1-NEXT: br label [[DOTAWAIT_WORK:%.*]] +// CHECK1: .await.work: +// CHECK1-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK1-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]]) +// CHECK1-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8 +// CHECK1-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1 +// CHECK1-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 8 +// CHECK1-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null +// CHECK1-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]] +// CHECK1: .select.workers: +// CHECK1-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1 +// CHECK1-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0 +// CHECK1-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]] +// CHECK1: .execute.parallel: +// CHECK1-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) +// CHECK1-NEXT: [[TMP5:%.*]] = load i8*, i8** [[WORK_FN]], align 8 +// CHECK1-NEXT: [[WORK_MATCH:%.*]] = icmp eq i8* [[TMP5]], bitcast (void (i16, i32)* @__omp_outlined__4_wrapper to i8*) +// CHECK1-NEXT: br i1 [[WORK_MATCH]], label [[DOTEXECUTE_FN:%.*]], label [[DOTCHECK_NEXT:%.*]] +// CHECK1: .execute.fn: +// CHECK1-NEXT: call void @__omp_outlined__4_wrapper(i16 0, i32 [[TMP4]]) #[[ATTR3]] +// CHECK1-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]] +// CHECK1: .check.next: +// CHECK1-NEXT: [[TMP6:%.*]] = bitcast 
i8* [[TMP2]] to void (i16, i32)* +// CHECK1-NEXT: call void [[TMP6]](i16 0, i32 [[TMP4]]) +// CHECK1-NEXT: br label [[DOTTERMINATE_PARALLEL]] +// CHECK1: .terminate.parallel: +// CHECK1-NEXT: call void @__kmpc_kernel_end_parallel() +// CHECK1-NEXT: br label [[DOTBARRIER_PARALLEL]] +// CHECK1: .barrier.parallel: +// CHECK1-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK1-NEXT: br label [[DOTAWAIT_WORK]] +// CHECK1: .exit: +// CHECK1-NEXT: ret void +// +// +// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l58 +// CHECK1-SAME: (i64 [[A:%.*]]) #[[ATTR1]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[A_ADDR:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [1 x i8*], align 8 +// CHECK1-NEXT: store i64 [[A]], i64* [[A_ADDR]], align 8 +// CHECK1-NEXT: [[CONV:%.*]] = bitcast i64* [[A_ADDR]] to i32* +// CHECK1-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK1-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK1-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK1-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]] +// CHECK1-NEXT: [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]] +// CHECK1-NEXT: br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]] +// CHECK1: .worker: +// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l58_worker() #[[ATTR3]] +// CHECK1-NEXT: br label [[DOTEXIT:%.*]] +// CHECK1: .mastercheck: +// CHECK1-NEXT: [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK1-NEXT: [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK1-NEXT: [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK1-NEXT: [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1 +// CHECK1-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1 +// CHECK1-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], -1 +// CHECK1-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP2]], [[TMP3]] +// CHECK1-NEXT: [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]] +// CHECK1-NEXT: br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]] +// CHECK1: .master: +// CHECK1-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK1-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK1-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]] +// CHECK1-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1) +// CHECK1-NEXT: call void @__kmpc_data_sharing_init_stack() +// CHECK1-NEXT: [[TMP5:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared", align 2 +// CHECK1-NEXT: [[TMP6:%.*]] = load i64, i64* @"_openmp_static_kernel$size", align 8 +// CHECK1-NEXT: call void @__kmpc_get_team_static_memory(i16 0, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds (%"union._shared_openmp_static_memory_type_$_", %"union._shared_openmp_static_memory_type_$_" addrspace(3)* @"_openmp_shared_static_glob_rd_$_", i32 0, i32 0, i32 0) to i8*), i64 [[TMP6]], i16 [[TMP5]], i8** addrspacecast (i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr" to i8**)) +// CHECK1-NEXT: [[TMP7:%.*]] = load i8*, i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr", align 8 +// CHECK1-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, i8* [[TMP7]], i64 0 +// CHECK1-NEXT: [[TMP9:%.*]] = bitcast i8* 
[[TMP8]] to %struct._globalized_locals_ty* +// CHECK1-NEXT: [[TMP10:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK1-NEXT: [[A7:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP9]], i32 0, i32 0 +// CHECK1-NEXT: store i32 [[TMP10]], i32* [[A7]], align 4 +// CHECK1-NEXT: [[TMP11:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) +// CHECK1-NEXT: [[TMP12:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0 +// CHECK1-NEXT: [[TMP13:%.*]] = bitcast i32* [[A7]] to i8* +// CHECK1-NEXT: store i8* [[TMP13]], i8** [[TMP12]], align 8 +// CHECK1-NEXT: [[TMP14:%.*]] = bitcast [1 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK1-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP11]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32*)* @__omp_outlined__4 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__4_wrapper to i8*), i8** [[TMP14]], i64 1) +// CHECK1-NEXT: [[TMP15:%.*]] = load i32, i32* [[A7]], align 4 +// CHECK1-NEXT: [[INC:%.*]] = add nsw i32 [[TMP15]], 1 +// CHECK1-NEXT: store i32 [[INC]], i32* [[A7]], align 4 +// CHECK1-NEXT: [[TMP16:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared", align 2 +// CHECK1-NEXT: call void @__kmpc_restore_team_static_memory(i16 0, i16 [[TMP16]]) +// CHECK1-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]] +// CHECK1: .termination.notifier: +// CHECK1-NEXT: call void @__kmpc_kernel_deinit(i16 1) +// CHECK1-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK1-NEXT: br label [[DOTEXIT]] +// CHECK1: .exit: +// CHECK1-NEXT: ret void +// +// +// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__4 +// CHECK1-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR1]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK1-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 8 +// CHECK1-NEXT: [[CRITICAL_COUNTER:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK1-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +// CHECK1-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 8 +// CHECK1-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 8 +// CHECK1-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_warp_active_thread_mask() +// CHECK1-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK1-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK1-NEXT: store i32 0, i32* [[CRITICAL_COUNTER]], align 4 +// CHECK1-NEXT: br label [[OMP_CRITICAL_LOOP:%.*]] +// CHECK1: omp.critical.loop: +// CHECK1-NEXT: [[TMP2:%.*]] = load i32, i32* [[CRITICAL_COUNTER]], align 4 +// CHECK1-NEXT: [[TMP3:%.*]] = icmp slt i32 [[TMP2]], [[NVPTX_NUM_THREADS]] +// CHECK1-NEXT: br i1 [[TMP3]], label [[OMP_CRITICAL_TEST:%.*]], label [[OMP_CRITICAL_EXIT:%.*]] +// CHECK1: omp.critical.test: +// CHECK1-NEXT: [[TMP4:%.*]] = load i32, i32* [[CRITICAL_COUNTER]], align 4 +// CHECK1-NEXT: [[TMP5:%.*]] = icmp eq i32 [[NVPTX_TID]], [[TMP4]] +// CHECK1-NEXT: br i1 [[TMP5]], label [[OMP_CRITICAL_BODY:%.*]], label [[OMP_CRITICAL_SYNC:%.*]] +// CHECK1: omp.critical.body: +// CHECK1-NEXT: [[TMP6:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK1-NEXT: [[TMP7:%.*]] = load i32, i32* [[TMP6]], 
align 4 +// CHECK1-NEXT: call void @__kmpc_critical(%struct.ident_t* @[[GLOB1]], i32 [[TMP7]], [8 x i32]* @"_gomp_critical_user_$var") +// CHECK1-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP0]], align 4 +// CHECK1-NEXT: [[INC:%.*]] = add nsw i32 [[TMP8]], 1 +// CHECK1-NEXT: store i32 [[INC]], i32* [[TMP0]], align 4 +// CHECK1-NEXT: call void @__kmpc_end_critical(%struct.ident_t* @[[GLOB1]], i32 [[TMP7]], [8 x i32]* @"_gomp_critical_user_$var") +// CHECK1-NEXT: br label [[OMP_CRITICAL_SYNC]] +// CHECK1: omp.critical.sync: +// CHECK1-NEXT: call void @__kmpc_syncwarp(i32 [[TMP1]]) +// CHECK1-NEXT: [[TMP9:%.*]] = add nsw i32 [[TMP4]], 1 +// CHECK1-NEXT: store i32 [[TMP9]], i32* [[CRITICAL_COUNTER]], align 4 +// CHECK1-NEXT: br label [[OMP_CRITICAL_LOOP]] +// CHECK1: omp.critical.exit: +// CHECK1-NEXT: ret void +// +// +// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__4_wrapper +// CHECK1-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 +// CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 +// CHECK1-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK1-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 +// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK1-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) +// CHECK1-NEXT: [[TMP2:%.*]] = load i8**, i8*** [[GLOBAL_ARGS]], align 8 +// CHECK1-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8*, i8** [[TMP2]], i64 0 +// CHECK1-NEXT: [[TMP4:%.*]] = bitcast i8** [[TMP3]] to i32** +// CHECK1-NEXT: [[TMP5:%.*]] = load i32*, i32** [[TMP4]], align 8 +// CHECK1-NEXT: call void @__omp_outlined__4(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]], i32* [[TMP5]]) #[[ATTR3]] +// CHECK1-NEXT: ret void +// +// +// CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l29_worker +// CHECK2-SAME: () #[[ATTR0:[0-9]+]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 8 +// CHECK2-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1 +// CHECK2-NEXT: store i8* null, i8** [[WORK_FN]], align 8 +// CHECK2-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1 +// CHECK2-NEXT: br label [[DOTAWAIT_WORK:%.*]] +// CHECK2: .await.work: +// CHECK2-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK2-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]]) +// CHECK2-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8 +// CHECK2-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1 +// CHECK2-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 8 +// CHECK2-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null +// CHECK2-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]] +// CHECK2: .select.workers: +// CHECK2-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1 +// CHECK2-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0 +// CHECK2-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]] +// CHECK2: .execute.parallel: +// CHECK2-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]]) +// CHECK2-NEXT: [[TMP5:%.*]] = load i8*, i8** [[WORK_FN]], align 8 +// CHECK2-NEXT: [[WORK_MATCH:%.*]] = icmp eq i8* [[TMP5]], bitcast (void (i16, i32)* @__omp_outlined___wrapper to i8*) +// CHECK2-NEXT: br i1 [[WORK_MATCH]], label 
[[DOTEXECUTE_FN:%.*]], label [[DOTCHECK_NEXT:%.*]] +// CHECK2: .execute.fn: +// CHECK2-NEXT: call void @__omp_outlined___wrapper(i16 0, i32 [[TMP4]]) #[[ATTR3:[0-9]+]] +// CHECK2-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]] +// CHECK2: .check.next: +// CHECK2-NEXT: [[TMP6:%.*]] = load i8*, i8** [[WORK_FN]], align 8 +// CHECK2-NEXT: [[WORK_MATCH1:%.*]] = icmp eq i8* [[TMP6]], bitcast (void (i16, i32)* @__omp_outlined__1_wrapper to i8*) +// CHECK2-NEXT: br i1 [[WORK_MATCH1]], label [[DOTEXECUTE_FN2:%.*]], label [[DOTCHECK_NEXT3:%.*]] +// CHECK2: .execute.fn2: +// CHECK2-NEXT: call void @__omp_outlined__1_wrapper(i16 0, i32 [[TMP4]]) #[[ATTR3]] +// CHECK2-NEXT: br label [[DOTTERMINATE_PARALLEL]] +// CHECK2: .check.next3: +// CHECK2-NEXT: [[TMP7:%.*]] = load i8*, i8** [[WORK_FN]], align 8 +// CHECK2-NEXT: [[WORK_MATCH4:%.*]] = icmp eq i8* [[TMP7]], bitcast (void (i16, i32)* @__omp_outlined__2_wrapper to i8*) +// CHECK2-NEXT: br i1 [[WORK_MATCH4]], label [[DOTEXECUTE_FN5:%.*]], label [[DOTCHECK_NEXT6:%.*]] +// CHECK2: .execute.fn5: +// CHECK2-NEXT: call void @__omp_outlined__2_wrapper(i16 0, i32 [[TMP4]]) #[[ATTR3]] +// CHECK2-NEXT: br label [[DOTTERMINATE_PARALLEL]] +// CHECK2: .check.next6: +// CHECK2-NEXT: [[TMP8:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)* +// CHECK2-NEXT: call void [[TMP8]](i16 0, i32 [[TMP4]]) +// CHECK2-NEXT: br label [[DOTTERMINATE_PARALLEL]] +// CHECK2: .terminate.parallel: +// CHECK2-NEXT: call void @__kmpc_kernel_end_parallel() +// CHECK2-NEXT: br label [[DOTBARRIER_PARALLEL]] +// CHECK2: .barrier.parallel: +// CHECK2-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK2-NEXT: br label [[DOTAWAIT_WORK]] +// CHECK2: .exit: +// CHECK2-NEXT: ret void +// +// +// CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l29 +// CHECK2-SAME: (i64 [[A:%.*]]) #[[ATTR1:[0-9]+]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[A_ADDR:%.*]] = alloca i64, align 8 +// CHECK2-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 8 +// CHECK2-NEXT: [[CAPTURED_VARS_ADDRS7:%.*]] = alloca [0 x i8*], align 8 +// CHECK2-NEXT: [[CAPTURED_VARS_ADDRS8:%.*]] = alloca [0 x i8*], align 8 +// CHECK2-NEXT: store i64 [[A]], i64* [[A_ADDR]], align 8 +// CHECK2-NEXT: [[CONV:%.*]] = bitcast i64* [[A_ADDR]] to i32* +// CHECK2-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK2-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK2-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK2-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]] +// CHECK2-NEXT: [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]] +// CHECK2-NEXT: br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]] +// CHECK2: .worker: +// CHECK2-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l29_worker() #[[ATTR3]] +// CHECK2-NEXT: br label [[DOTEXIT:%.*]] +// CHECK2: .mastercheck: +// CHECK2-NEXT: [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK2-NEXT: [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK2-NEXT: [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK2-NEXT: [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1 +// CHECK2-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1 +// CHECK2-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], -1 +// CHECK2-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP2]], [[TMP3]] +// 
CHECK2-NEXT: [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]] +// CHECK2-NEXT: br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]] +// CHECK2: .master: +// CHECK2-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK2-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK2-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]] +// CHECK2-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1) +// CHECK2-NEXT: call void @__kmpc_data_sharing_init_stack() +// CHECK2-NEXT: [[TMP5:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) +// CHECK2-NEXT: [[TMP6:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK2-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__ to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined___wrapper to i8*), i8** [[TMP6]], i64 0) +// CHECK2-NEXT: [[TMP7:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS7]] to i8** +// CHECK2-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i32 0, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__1 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__1_wrapper to i8*), i8** [[TMP7]], i64 0) +// CHECK2-NEXT: [[TMP8:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS8]] to i8** +// CHECK2-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__2 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__2_wrapper to i8*), i8** [[TMP8]], i64 0) +// CHECK2-NEXT: [[TMP9:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK2-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP9]], 1 +// CHECK2-NEXT: store i32 [[ADD]], i32* [[CONV]], align 8 +// CHECK2-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]] +// CHECK2: .termination.notifier: +// CHECK2-NEXT: call void @__kmpc_kernel_deinit(i16 1) +// CHECK2-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK2-NEXT: br label [[DOTEXIT]] +// CHECK2: .exit: +// CHECK2-NEXT: ret void +// +// +// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__ +// CHECK2-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK2-NEXT: [[A:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK2-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +// CHECK2-NEXT: store i32 42, i32* [[A]], align 4 +// CHECK2-NEXT: ret void +// +// +// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined___wrapper +// CHECK2-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 +// CHECK2-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 +// CHECK2-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK2-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 +// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK2-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) +// CHECK2-NEXT: call void @__omp_outlined__(i32* 
[[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]] +// CHECK2-NEXT: ret void +// +// +// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__1 +// CHECK2-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK2-NEXT: [[A:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK2-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +// CHECK2-NEXT: store i32 43, i32* [[A]], align 4 +// CHECK2-NEXT: ret void +// +// +// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__1_wrapper +// CHECK2-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 +// CHECK2-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 +// CHECK2-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK2-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 +// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK2-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) +// CHECK2-NEXT: call void @__omp_outlined__1(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]] +// CHECK2-NEXT: ret void +// +// +// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__2 +// CHECK2-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK2-NEXT: [[A:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK2-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +// CHECK2-NEXT: store i32 44, i32* [[A]], align 4 +// CHECK2-NEXT: ret void +// +// +// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__2_wrapper +// CHECK2-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 +// CHECK2-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 +// CHECK2-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK2-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 +// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK2-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) +// CHECK2-NEXT: call void @__omp_outlined__2(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]] +// CHECK2-NEXT: ret void +// +// +// CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l46_worker +// CHECK2-SAME: () #[[ATTR0]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 8 +// CHECK2-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1 +// CHECK2-NEXT: store i8* null, i8** [[WORK_FN]], align 8 +// CHECK2-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1 +// CHECK2-NEXT: br label [[DOTAWAIT_WORK:%.*]] +// CHECK2: .await.work: +// CHECK2-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK2-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]]) +// 
CHECK2-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8 +// CHECK2-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1 +// CHECK2-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 8 +// CHECK2-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null +// CHECK2-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]] +// CHECK2: .select.workers: +// CHECK2-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1 +// CHECK2-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0 +// CHECK2-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]] +// CHECK2: .execute.parallel: +// CHECK2-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) +// CHECK2-NEXT: [[TMP5:%.*]] = load i8*, i8** [[WORK_FN]], align 8 +// CHECK2-NEXT: [[WORK_MATCH:%.*]] = icmp eq i8* [[TMP5]], bitcast (void (i16, i32)* @__omp_outlined__3_wrapper to i8*) +// CHECK2-NEXT: br i1 [[WORK_MATCH]], label [[DOTEXECUTE_FN:%.*]], label [[DOTCHECK_NEXT:%.*]] +// CHECK2: .execute.fn: +// CHECK2-NEXT: call void @__omp_outlined__3_wrapper(i16 0, i32 [[TMP4]]) #[[ATTR3]] +// CHECK2-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]] +// CHECK2: .check.next: +// CHECK2-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)* +// CHECK2-NEXT: call void [[TMP6]](i16 0, i32 [[TMP4]]) +// CHECK2-NEXT: br label [[DOTTERMINATE_PARALLEL]] +// CHECK2: .terminate.parallel: +// CHECK2-NEXT: call void @__kmpc_kernel_end_parallel() +// CHECK2-NEXT: br label [[DOTBARRIER_PARALLEL]] +// CHECK2: .barrier.parallel: +// CHECK2-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK2-NEXT: br label [[DOTAWAIT_WORK]] +// CHECK2: .exit: +// CHECK2-NEXT: ret void +// +// +// CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l46 +// CHECK2-SAME: (i64 [[N:%.*]], i64 [[A:%.*]], i64 [[AA:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR1]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 +// CHECK2-NEXT: [[A_ADDR:%.*]] = alloca i64, align 8 +// CHECK2-NEXT: [[AA_ADDR:%.*]] = alloca i64, align 8 +// CHECK2-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 8 +// CHECK2-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 8 +// CHECK2-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 +// CHECK2-NEXT: store i64 [[A]], i64* [[A_ADDR]], align 8 +// CHECK2-NEXT: store i64 [[AA]], i64* [[AA_ADDR]], align 8 +// CHECK2-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 8 +// CHECK2-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* +// CHECK2-NEXT: [[CONV1:%.*]] = bitcast i64* [[A_ADDR]] to i32* +// CHECK2-NEXT: [[CONV2:%.*]] = bitcast i64* [[AA_ADDR]] to i16* +// CHECK2-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 8 +// CHECK2-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK2-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK2-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK2-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]] +// CHECK2-NEXT: [[TMP1:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]] +// CHECK2-NEXT: br i1 [[TMP1]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]] +// CHECK2: .worker: +// CHECK2-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l46_worker() #[[ATTR3]] +// CHECK2-NEXT: br label [[DOTEXIT:%.*]] +// CHECK2: 
.mastercheck: +// CHECK2-NEXT: [[NVPTX_TID3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK2-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK2-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK2-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE5]], 1 +// CHECK2-NEXT: [[TMP3:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], 1 +// CHECK2-NEXT: [[TMP4:%.*]] = xor i32 [[TMP2]], -1 +// CHECK2-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP3]], [[TMP4]] +// CHECK2-NEXT: [[TMP5:%.*]] = icmp eq i32 [[NVPTX_TID3]], [[MASTER_TID]] +// CHECK2-NEXT: br i1 [[TMP5]], label [[DOTMASTER:%.*]], label [[DOTEXIT]] +// CHECK2: .master: +// CHECK2-NEXT: [[NVPTX_NUM_THREADS6:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK2-NEXT: [[NVPTX_WARP_SIZE7:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK2-NEXT: [[THREAD_LIMIT8:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS6]], [[NVPTX_WARP_SIZE7]] +// CHECK2-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT8]], i16 1) +// CHECK2-NEXT: call void @__kmpc_data_sharing_init_stack() +// CHECK2-NEXT: [[TMP6:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) +// CHECK2-NEXT: [[TMP7:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK2-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP7]], 1000 +// CHECK2-NEXT: [[TMP8:%.*]] = zext i1 [[CMP]] to i32 +// CHECK2-NEXT: [[TMP9:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK2-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP6]], i32 [[TMP8]], i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__3 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__3_wrapper to i8*), i8** [[TMP9]], i64 0) +// CHECK2-NEXT: [[TMP10:%.*]] = load i32, i32* [[CONV1]], align 8 +// CHECK2-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP10]], 1 +// CHECK2-NEXT: store i32 [[ADD]], i32* [[CONV1]], align 8 +// CHECK2-NEXT: [[TMP11:%.*]] = load i16, i16* [[CONV2]], align 8 +// CHECK2-NEXT: [[CONV9:%.*]] = sext i16 [[TMP11]] to i32 +// CHECK2-NEXT: [[ADD10:%.*]] = add nsw i32 [[CONV9]], 1 +// CHECK2-NEXT: [[CONV11:%.*]] = trunc i32 [[ADD10]] to i16 +// CHECK2-NEXT: store i16 [[CONV11]], i16* [[CONV2]], align 8 +// CHECK2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP0]], i64 0, i64 2 +// CHECK2-NEXT: [[TMP12:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +// CHECK2-NEXT: [[ADD12:%.*]] = add nsw i32 [[TMP12]], 1 +// CHECK2-NEXT: store i32 [[ADD12]], i32* [[ARRAYIDX]], align 4 +// CHECK2-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]] +// CHECK2: .termination.notifier: +// CHECK2-NEXT: call void @__kmpc_kernel_deinit(i16 1) +// CHECK2-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK2-NEXT: br label [[DOTEXIT]] +// CHECK2: .exit: +// CHECK2-NEXT: ret void +// +// +// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__3 +// CHECK2-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK2-NEXT: [[A:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK2-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +// CHECK2-NEXT: store i32 45, i32* [[A]], align 4 +// CHECK2-NEXT: [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK2-NEXT: [[TMP1:%.*]] 
= load i32, i32* [[TMP0]], align 4 +// CHECK2-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP1]]) +// CHECK2-NEXT: ret void +// +// +// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__3_wrapper +// CHECK2-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 +// CHECK2-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 +// CHECK2-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK2-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 +// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK2-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) +// CHECK2-NEXT: call void @__omp_outlined__3(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]] +// CHECK2-NEXT: ret void +// +// +// CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l58_worker +// CHECK2-SAME: () #[[ATTR0]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 8 +// CHECK2-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1 +// CHECK2-NEXT: store i8* null, i8** [[WORK_FN]], align 8 +// CHECK2-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1 +// CHECK2-NEXT: br label [[DOTAWAIT_WORK:%.*]] +// CHECK2: .await.work: +// CHECK2-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK2-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]]) +// CHECK2-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8 +// CHECK2-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1 +// CHECK2-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 8 +// CHECK2-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null +// CHECK2-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]] +// CHECK2: .select.workers: +// CHECK2-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1 +// CHECK2-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0 +// CHECK2-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]] +// CHECK2: .execute.parallel: +// CHECK2-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) +// CHECK2-NEXT: [[TMP5:%.*]] = load i8*, i8** [[WORK_FN]], align 8 +// CHECK2-NEXT: [[WORK_MATCH:%.*]] = icmp eq i8* [[TMP5]], bitcast (void (i16, i32)* @__omp_outlined__4_wrapper to i8*) +// CHECK2-NEXT: br i1 [[WORK_MATCH]], label [[DOTEXECUTE_FN:%.*]], label [[DOTCHECK_NEXT:%.*]] +// CHECK2: .execute.fn: +// CHECK2-NEXT: call void @__omp_outlined__4_wrapper(i16 0, i32 [[TMP4]]) #[[ATTR3]] +// CHECK2-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]] +// CHECK2: .check.next: +// CHECK2-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)* +// CHECK2-NEXT: call void [[TMP6]](i16 0, i32 [[TMP4]]) +// CHECK2-NEXT: br label [[DOTTERMINATE_PARALLEL]] +// CHECK2: .terminate.parallel: +// CHECK2-NEXT: call void @__kmpc_kernel_end_parallel() +// CHECK2-NEXT: br label [[DOTBARRIER_PARALLEL]] +// CHECK2: .barrier.parallel: +// CHECK2-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK2-NEXT: br label [[DOTAWAIT_WORK]] +// CHECK2: .exit: +// CHECK2-NEXT: ret void +// +// +// CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l58 +// CHECK2-SAME: (i64 [[A:%.*]]) #[[ATTR1]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[A_ADDR:%.*]] = 
alloca i64, align 8 +// CHECK2-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [1 x i8*], align 8 +// CHECK2-NEXT: store i64 [[A]], i64* [[A_ADDR]], align 8 +// CHECK2-NEXT: [[CONV:%.*]] = bitcast i64* [[A_ADDR]] to i32* +// CHECK2-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK2-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK2-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK2-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]] +// CHECK2-NEXT: [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]] +// CHECK2-NEXT: br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]] +// CHECK2: .worker: +// CHECK2-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l58_worker() #[[ATTR3]] +// CHECK2-NEXT: br label [[DOTEXIT:%.*]] +// CHECK2: .mastercheck: +// CHECK2-NEXT: [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK2-NEXT: [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK2-NEXT: [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK2-NEXT: [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1 +// CHECK2-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1 +// CHECK2-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], -1 +// CHECK2-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP2]], [[TMP3]] +// CHECK2-NEXT: [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]] +// CHECK2-NEXT: br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]] +// CHECK2: .master: +// CHECK2-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK2-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK2-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]] +// CHECK2-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1) +// CHECK2-NEXT: call void @__kmpc_data_sharing_init_stack() +// CHECK2-NEXT: [[TMP5:%.*]] = call i8* @__kmpc_data_sharing_push_stack(i64 4, i16 1) +// CHECK2-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty* +// CHECK2-NEXT: [[TMP7:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK2-NEXT: [[A7:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP6]], i32 0, i32 0 +// CHECK2-NEXT: store i32 [[TMP7]], i32* [[A7]], align 4 +// CHECK2-NEXT: [[TMP8:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) +// CHECK2-NEXT: [[TMP9:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0 +// CHECK2-NEXT: [[TMP10:%.*]] = bitcast i32* [[A7]] to i8* +// CHECK2-NEXT: store i8* [[TMP10]], i8** [[TMP9]], align 8 +// CHECK2-NEXT: [[TMP11:%.*]] = bitcast [1 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK2-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP8]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32*)* @__omp_outlined__4 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__4_wrapper to i8*), i8** [[TMP11]], i64 1) +// CHECK2-NEXT: [[TMP12:%.*]] = load i32, i32* [[A7]], align 4 +// CHECK2-NEXT: [[INC:%.*]] = add nsw i32 [[TMP12]], 1 +// CHECK2-NEXT: store i32 [[INC]], i32* [[A7]], align 4 +// CHECK2-NEXT: call void @__kmpc_data_sharing_pop_stack(i8* [[TMP5]]) +// CHECK2-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]] +// CHECK2: .termination.notifier: +// CHECK2-NEXT: call void @__kmpc_kernel_deinit(i16 1) 
+// CHECK2-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
+// CHECK2-NEXT: br label [[DOTEXIT]]
+// CHECK2: .exit:
+// CHECK2-NEXT: ret void
+//
+//
+// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__4
+// CHECK2-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR1]] {
+// CHECK2-NEXT: entry:
+// CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK2-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 8
+// CHECK2-NEXT: [[CRITICAL_COUNTER:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK2-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
+// CHECK2-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 8
+// CHECK2-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 8
+// CHECK2-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_warp_active_thread_mask()
+// CHECK2-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+// CHECK2-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK2-NEXT: store i32 0, i32* [[CRITICAL_COUNTER]], align 4
+// CHECK2-NEXT: br label [[OMP_CRITICAL_LOOP:%.*]]
+// CHECK2: omp.critical.loop:
+// CHECK2-NEXT: [[TMP2:%.*]] = load i32, i32* [[CRITICAL_COUNTER]], align 4
+// CHECK2-NEXT: [[TMP3:%.*]] = icmp slt i32 [[TMP2]], [[NVPTX_NUM_THREADS]]
+// CHECK2-NEXT: br i1 [[TMP3]], label [[OMP_CRITICAL_TEST:%.*]], label [[OMP_CRITICAL_EXIT:%.*]]
+// CHECK2: omp.critical.test:
+// CHECK2-NEXT: [[TMP4:%.*]] = load i32, i32* [[CRITICAL_COUNTER]], align 4
+// CHECK2-NEXT: [[TMP5:%.*]] = icmp eq i32 [[NVPTX_TID]], [[TMP4]]
+// CHECK2-NEXT: br i1 [[TMP5]], label [[OMP_CRITICAL_BODY:%.*]], label [[OMP_CRITICAL_SYNC:%.*]]
+// CHECK2: omp.critical.body:
+// CHECK2-NEXT: [[TMP6:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK2-NEXT: [[TMP7:%.*]] = load i32, i32* [[TMP6]], align 4
+// CHECK2-NEXT: call void @__kmpc_critical(%struct.ident_t* @[[GLOB1]], i32 [[TMP7]], [8 x i32]* @"_gomp_critical_user_$var")
+// CHECK2-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP0]], align 4
+// CHECK2-NEXT: [[INC:%.*]] = add nsw i32 [[TMP8]], 1
+// CHECK2-NEXT: store i32 [[INC]], i32* [[TMP0]], align 4
+// CHECK2-NEXT: call void @__kmpc_end_critical(%struct.ident_t* @[[GLOB1]], i32 [[TMP7]], [8 x i32]* @"_gomp_critical_user_$var")
+// CHECK2-NEXT: br label [[OMP_CRITICAL_SYNC]]
+// CHECK2: omp.critical.sync:
+// CHECK2-NEXT: call void @__kmpc_syncwarp(i32 [[TMP1]])
+// CHECK2-NEXT: [[TMP9:%.*]] = add nsw i32 [[TMP4]], 1
+// CHECK2-NEXT: store i32 [[TMP9]], i32* [[CRITICAL_COUNTER]], align 4
+// CHECK2-NEXT: br label [[OMP_CRITICAL_LOOP]]
+// CHECK2: omp.critical.exit:
+// CHECK2-NEXT: ret void
+//
+//
+// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__4_wrapper
+// CHECK2-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] {
+// CHECK2-NEXT: entry:
+// CHECK2-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2
+// CHECK2-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8
+// CHECK2-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK2-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2
+// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
+// CHECK2-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]])
+// CHECK2-NEXT: [[TMP2:%.*]] = load i8**, i8*** [[GLOBAL_ARGS]], align 8
+// CHECK2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8*, i8** [[TMP2]], i64 0
+// CHECK2-NEXT: [[TMP4:%.*]] = bitcast i8** [[TMP3]] to i32**
+// CHECK2-NEXT: [[TMP5:%.*]] = load i32*, i32** [[TMP4]], align 8
+// CHECK2-NEXT: call void @__omp_outlined__4(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]], i32* [[TMP5]]) #[[ATTR3]]
+// CHECK2-NEXT: ret void
+//
+//
+// CHECK4-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l29_worker
+// CHECK4-SAME: () #[[ATTR0:[0-9]+]] {
+// CHECK4-NEXT: entry:
+// CHECK4-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 4
+// CHECK4-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1
+// CHECK4-NEXT: store i8* null, i8** [[WORK_FN]], align 4
+// CHECK4-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1
+// CHECK4-NEXT: br label [[DOTAWAIT_WORK:%.*]]
+// CHECK4: .await.work:
+// CHECK4-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
+// CHECK4-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]])
+// CHECK4-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8
+// CHECK4-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1
+// CHECK4-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 4
+// CHECK4-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null
+// CHECK4-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]]
+// CHECK4: .select.workers:
+// CHECK4-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1
+// CHECK4-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0
+// CHECK4-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]]
+// CHECK4: .execute.parallel:
+// CHECK4-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]])
+// CHECK4-NEXT: [[TMP5:%.*]] = load i8*, i8** [[WORK_FN]], align 4
+// CHECK4-NEXT: [[WORK_MATCH:%.*]] = icmp eq i8* [[TMP5]], bitcast (void (i16, i32)* @__omp_outlined___wrapper to i8*)
+// CHECK4-NEXT: br i1 [[WORK_MATCH]], label [[DOTEXECUTE_FN:%.*]], label [[DOTCHECK_NEXT:%.*]]
+// CHECK4: .execute.fn:
+// CHECK4-NEXT: call void @__omp_outlined___wrapper(i16 0, i32 [[TMP4]]) #[[ATTR2:[0-9]+]]
+// CHECK4-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]]
+// CHECK4: .check.next:
+// CHECK4-NEXT: [[TMP6:%.*]] = load i8*, i8** [[WORK_FN]], align 4
+// CHECK4-NEXT: [[WORK_MATCH1:%.*]] = icmp eq i8* [[TMP6]], bitcast (void (i16, i32)* @__omp_outlined__1_wrapper to i8*)
+// CHECK4-NEXT: br i1 [[WORK_MATCH1]], label [[DOTEXECUTE_FN2:%.*]], label [[DOTCHECK_NEXT3:%.*]]
+// CHECK4: .execute.fn2:
+// CHECK4-NEXT: call void @__omp_outlined__1_wrapper(i16 0, i32 [[TMP4]]) #[[ATTR2]]
+// CHECK4-NEXT: br label [[DOTTERMINATE_PARALLEL]]
+// CHECK4: .check.next3:
+// CHECK4-NEXT: [[TMP7:%.*]] = load i8*, i8** [[WORK_FN]], align 4
+// CHECK4-NEXT: [[WORK_MATCH4:%.*]] = icmp eq i8* [[TMP7]], bitcast (void (i16, i32)* @__omp_outlined__2_wrapper to i8*)
+// CHECK4-NEXT: br i1 [[WORK_MATCH4]], label [[DOTEXECUTE_FN5:%.*]], label [[DOTCHECK_NEXT6:%.*]]
+// CHECK4: .execute.fn5:
+// CHECK4-NEXT: call void @__omp_outlined__2_wrapper(i16 0, i32 [[TMP4]]) #[[ATTR2]]
+// CHECK4-NEXT: br label [[DOTTERMINATE_PARALLEL]]
+// CHECK4: .check.next6:
+// CHECK4-NEXT: [[TMP8:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)*
+// CHECK4-NEXT: call void [[TMP8]](i16 0, i32 [[TMP4]])
+// CHECK4-NEXT: br label [[DOTTERMINATE_PARALLEL]]
+// CHECK4: .terminate.parallel:
+// CHECK4-NEXT: call void @__kmpc_kernel_end_parallel()
+// CHECK4-NEXT: br label [[DOTBARRIER_PARALLEL]]
+// CHECK4: .barrier.parallel:
+// CHECK4-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
+// CHECK4-NEXT: br label [[DOTAWAIT_WORK]]
+// CHECK4: .exit:
+// CHECK4-NEXT: ret void
+//
+//
+// CHECK4-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l29
+// CHECK4-SAME: (i32 [[A:%.*]]) #[[ATTR0]] {
+// CHECK4-NEXT: entry:
+// CHECK4-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 4
+// CHECK4-NEXT: [[CAPTURED_VARS_ADDRS7:%.*]] = alloca [0 x i8*], align 4
+// CHECK4-NEXT: [[CAPTURED_VARS_ADDRS8:%.*]] = alloca [0 x i8*], align 4
+// CHECK4-NEXT: store i32 [[A]], i32* [[A_ADDR]], align 4
+// CHECK4-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+// CHECK4-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK4-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+// CHECK4-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]]
+// CHECK4-NEXT: [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]]
+// CHECK4-NEXT: br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]]
+// CHECK4: .worker:
+// CHECK4-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l29_worker() #[[ATTR2]]
+// CHECK4-NEXT: br label [[DOTEXIT:%.*]]
+// CHECK4: .mastercheck:
+// CHECK4-NEXT: [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+// CHECK4-NEXT: [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK4-NEXT: [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+// CHECK4-NEXT: [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1
+// CHECK4-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1
+// CHECK4-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], -1
+// CHECK4-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP2]], [[TMP3]]
+// CHECK4-NEXT: [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]]
+// CHECK4-NEXT: br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]]
+// CHECK4: .master:
+// CHECK4-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK4-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+// CHECK4-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]]
+// CHECK4-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1)
+// CHECK4-NEXT: call void @__kmpc_data_sharing_init_stack()
+// CHECK4-NEXT: [[TMP5:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK4-NEXT: [[TMP6:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK4-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__ to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined___wrapper to i8*), i8** [[TMP6]], i32 0)
+// CHECK4-NEXT: [[TMP7:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS7]] to i8**
+// CHECK4-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i32 0, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__1 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__1_wrapper to i8*), i8** [[TMP7]], i32 0)
+// CHECK4-NEXT: [[TMP8:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS8]] to i8**
+// CHECK4-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__2 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__2_wrapper to i8*), i8** [[TMP8]], i32 0)
+// CHECK4-NEXT: [[TMP9:%.*]] = load i32, i32* [[A_ADDR]], align 4
+// CHECK4-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP9]], 1
+// CHECK4-NEXT: store i32 [[ADD]], i32* [[A_ADDR]], align 4
+// CHECK4-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]]
+// CHECK4: .termination.notifier:
+// CHECK4-NEXT: call void @__kmpc_kernel_deinit(i16 1)
+// CHECK4-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
+// CHECK4-NEXT: br label [[DOTEXIT]]
+// CHECK4: .exit:
+// CHECK4-NEXT: ret void
+//
+//
+// CHECK4-LABEL: define {{[^@]+}}@__omp_outlined__
+// CHECK4-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] {
+// CHECK4-NEXT: entry:
+// CHECK4-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK4-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK4-NEXT: [[A:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK4-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK4-NEXT: store i32 42, i32* [[A]], align 4
+// CHECK4-NEXT: ret void
+//
+//
+// CHECK4-LABEL: define {{[^@]+}}@__omp_outlined___wrapper
+// CHECK4-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] {
+// CHECK4-NEXT: entry:
+// CHECK4-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2
+// CHECK4-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 4
+// CHECK4-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK4-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2
+// CHECK4-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
+// CHECK4-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]])
+// CHECK4-NEXT: call void @__omp_outlined__(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR2]]
+// CHECK4-NEXT: ret void
+//
+//
+// CHECK4-LABEL: define {{[^@]+}}@__omp_outlined__1
+// CHECK4-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] {
+// CHECK4-NEXT: entry:
+// CHECK4-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK4-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK4-NEXT: [[A:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK4-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK4-NEXT: store i32 43, i32* [[A]], align 4
+// CHECK4-NEXT: ret void
+//
+//
+// CHECK4-LABEL: define {{[^@]+}}@__omp_outlined__1_wrapper
+// CHECK4-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] {
+// CHECK4-NEXT: entry:
+// CHECK4-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2
+// CHECK4-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 4
+// CHECK4-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK4-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2
+// CHECK4-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
+// CHECK4-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]])
+// CHECK4-NEXT: call void @__omp_outlined__1(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR2]]
+// CHECK4-NEXT: ret void
+//
+//
+// CHECK4-LABEL: define {{[^@]+}}@__omp_outlined__2
+// CHECK4-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] {
+// CHECK4-NEXT: entry:
+// CHECK4-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK4-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK4-NEXT: [[A:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK4-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK4-NEXT: store i32 44, i32* [[A]], align 4
+// CHECK4-NEXT: ret void
+//
+//
+// CHECK4-LABEL: define {{[^@]+}}@__omp_outlined__2_wrapper
+// CHECK4-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] {
+// CHECK4-NEXT: entry:
+// CHECK4-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2
+// CHECK4-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 4
+// CHECK4-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK4-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2
+// CHECK4-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
+// CHECK4-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]])
+// CHECK4-NEXT: call void @__omp_outlined__2(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR2]]
+// CHECK4-NEXT: ret void
+//
+//
+// CHECK4-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l46_worker
+// CHECK4-SAME: () #[[ATTR0]] {
+// CHECK4-NEXT: entry:
+// CHECK4-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 4
+// CHECK4-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1
+// CHECK4-NEXT: store i8* null, i8** [[WORK_FN]], align 4
+// CHECK4-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1
+// CHECK4-NEXT: br label [[DOTAWAIT_WORK:%.*]]
+// CHECK4: .await.work:
+// CHECK4-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
+// CHECK4-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]])
+// CHECK4-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8
+// CHECK4-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1
+// CHECK4-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 4
+// CHECK4-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null
+// CHECK4-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]]
+// CHECK4: .select.workers:
+// CHECK4-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1
+// CHECK4-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0
+// CHECK4-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]]
+// CHECK4: .execute.parallel:
+// CHECK4-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK4-NEXT: [[TMP5:%.*]] = load i8*, i8** [[WORK_FN]], align 4
+// CHECK4-NEXT: [[WORK_MATCH:%.*]] = icmp eq i8* [[TMP5]], bitcast (void (i16, i32)* @__omp_outlined__3_wrapper to i8*)
+// CHECK4-NEXT: br i1 [[WORK_MATCH]], label [[DOTEXECUTE_FN:%.*]], label [[DOTCHECK_NEXT:%.*]]
+// CHECK4: .execute.fn:
+// CHECK4-NEXT: call void @__omp_outlined__3_wrapper(i16 0, i32 [[TMP4]]) #[[ATTR2]]
+// CHECK4-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]]
+// CHECK4: .check.next:
+// CHECK4-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)*
+// CHECK4-NEXT: call void [[TMP6]](i16 0, i32 [[TMP4]])
+// CHECK4-NEXT: br label [[DOTTERMINATE_PARALLEL]]
+// CHECK4: .terminate.parallel:
+// CHECK4-NEXT: call void @__kmpc_kernel_end_parallel()
+// CHECK4-NEXT: br label [[DOTBARRIER_PARALLEL]]
+// CHECK4: .barrier.parallel:
+// CHECK4-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
+// CHECK4-NEXT: br label [[DOTAWAIT_WORK]]
+// CHECK4: .exit:
+// CHECK4-NEXT: ret void
+//
+//
+// CHECK4-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l46
+// CHECK4-SAME: (i32 [[N:%.*]], i32 [[A:%.*]], i32 [[AA:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] {
+// CHECK4-NEXT: entry:
+// CHECK4-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[AA_ADDR:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4
+// CHECK4-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 4
+// CHECK4-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4
+// CHECK4-NEXT: store i32 [[A]], i32* [[A_ADDR]], align 4
+// CHECK4-NEXT: store i32 [[AA]], i32* [[AA_ADDR]], align 4
+// CHECK4-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4
+// CHECK4-NEXT: [[CONV:%.*]] = bitcast i32* [[AA_ADDR]] to i16*
+// CHECK4-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4
+// CHECK4-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+// CHECK4-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK4-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+// CHECK4-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]]
+// CHECK4-NEXT: [[TMP1:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]]
+// CHECK4-NEXT: br i1 [[TMP1]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]]
+// CHECK4: .worker:
+// CHECK4-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l46_worker() #[[ATTR2]]
+// CHECK4-NEXT: br label [[DOTEXIT:%.*]]
+// CHECK4: .mastercheck:
+// CHECK4-NEXT: [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+// CHECK4-NEXT: [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK4-NEXT: [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+// CHECK4-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1
+// CHECK4-NEXT: [[TMP3:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1
+// CHECK4-NEXT: [[TMP4:%.*]] = xor i32 [[TMP2]], -1
+// CHECK4-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP3]], [[TMP4]]
+// CHECK4-NEXT: [[TMP5:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]]
+// CHECK4-NEXT: br i1 [[TMP5]], label [[DOTMASTER:%.*]], label [[DOTEXIT]]
+// CHECK4: .master:
+// CHECK4-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK4-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+// CHECK4-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]]
+// CHECK4-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1)
+// CHECK4-NEXT: call void @__kmpc_data_sharing_init_stack()
+// CHECK4-NEXT: [[TMP6:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK4-NEXT: [[TMP7:%.*]] = load i32, i32* [[N_ADDR]], align 4
+// CHECK4-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP7]], 1000
+// CHECK4-NEXT: [[TMP8:%.*]] = zext i1 [[CMP]] to i32
+// CHECK4-NEXT: [[TMP9:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK4-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP6]], i32 [[TMP8]], i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__3 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__3_wrapper to i8*), i8** [[TMP9]], i32 0)
+// CHECK4-NEXT: [[TMP10:%.*]] = load i32, i32* [[A_ADDR]], align 4
+// CHECK4-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP10]], 1
+// CHECK4-NEXT: store i32 [[ADD]], i32* [[A_ADDR]], align 4
+// CHECK4-NEXT: [[TMP11:%.*]] = load i16, i16* [[CONV]], align 4
+// CHECK4-NEXT: [[CONV7:%.*]] = sext i16 [[TMP11]] to i32
+// CHECK4-NEXT: [[ADD8:%.*]] = add nsw i32 [[CONV7]], 1
+// CHECK4-NEXT: [[CONV9:%.*]] = trunc i32 [[ADD8]] to i16
+// CHECK4-NEXT: store i16 [[CONV9]], i16* [[CONV]], align 4
+// CHECK4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP0]], i32 0, i32 2
+// CHECK4-NEXT: [[TMP12:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+// CHECK4-NEXT: [[ADD10:%.*]] = add nsw i32 [[TMP12]], 1
+// CHECK4-NEXT: store i32 [[ADD10]], i32* [[ARRAYIDX]], align 4
+// CHECK4-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]]
+// CHECK4: .termination.notifier:
+// CHECK4-NEXT: call void @__kmpc_kernel_deinit(i16 1)
+// CHECK4-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
+// CHECK4-NEXT: br label [[DOTEXIT]]
+// CHECK4: .exit:
+// CHECK4-NEXT: ret void
+//
+//
+// CHECK4-LABEL: define {{[^@]+}}@__omp_outlined__3
+// CHECK4-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] {
+// CHECK4-NEXT: entry:
+// CHECK4-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK4-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK4-NEXT: [[A:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK4-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK4-NEXT: store i32 45, i32* [[A]], align 4
+// CHECK4-NEXT: [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK4-NEXT: [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
+// CHECK4-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP1]])
+// CHECK4-NEXT: ret void
+//
+//
+// CHECK4-LABEL: define {{[^@]+}}@__omp_outlined__3_wrapper
+// CHECK4-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] {
+// CHECK4-NEXT: entry:
+// CHECK4-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2
+// CHECK4-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 4
+// CHECK4-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK4-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2
+// CHECK4-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
+// CHECK4-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]])
+// CHECK4-NEXT: call void @__omp_outlined__3(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR2]]
+// CHECK4-NEXT: ret void
+//
+//
+// CHECK4-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l58_worker
+// CHECK4-SAME: () #[[ATTR0]] {
+// CHECK4-NEXT: entry:
+// CHECK4-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 4
+// CHECK4-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1
+// CHECK4-NEXT: store i8* null, i8** [[WORK_FN]], align 4
+// CHECK4-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1
+// CHECK4-NEXT: br label [[DOTAWAIT_WORK:%.*]]
+// CHECK4: .await.work:
+// CHECK4-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
+// CHECK4-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]])
+// CHECK4-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8
+// CHECK4-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1
+// CHECK4-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 4
+// CHECK4-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null
+// CHECK4-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]]
+// CHECK4: .select.workers:
+// CHECK4-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1
+// CHECK4-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0
+// CHECK4-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]]
+// CHECK4: .execute.parallel:
+// CHECK4-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK4-NEXT: [[TMP5:%.*]] = load i8*, i8** [[WORK_FN]], align 4
+// CHECK4-NEXT: [[WORK_MATCH:%.*]] = icmp eq i8* [[TMP5]], bitcast (void (i16, i32)* @__omp_outlined__4_wrapper to i8*)
+// CHECK4-NEXT: br i1 [[WORK_MATCH]], label [[DOTEXECUTE_FN:%.*]], label [[DOTCHECK_NEXT:%.*]]
+// CHECK4: .execute.fn:
+// CHECK4-NEXT: call void @__omp_outlined__4_wrapper(i16 0, i32 [[TMP4]]) #[[ATTR2]]
+// CHECK4-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]]
+// CHECK4: .check.next:
+// CHECK4-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)*
+// CHECK4-NEXT: call void [[TMP6]](i16 0, i32 [[TMP4]])
+// CHECK4-NEXT: br label [[DOTTERMINATE_PARALLEL]]
+// CHECK4: .terminate.parallel:
+// CHECK4-NEXT: call void @__kmpc_kernel_end_parallel()
+// CHECK4-NEXT: br label [[DOTBARRIER_PARALLEL]]
+// CHECK4: .barrier.parallel:
+// CHECK4-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
+// CHECK4-NEXT: br label [[DOTAWAIT_WORK]]
+// CHECK4: .exit:
+// CHECK4-NEXT: ret void
+//
+//
+// CHECK4-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l58
+// CHECK4-SAME: (i32 [[A:%.*]]) #[[ATTR0]] {
+// CHECK4-NEXT: entry:
+// CHECK4-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [1 x i8*], align 4
+// CHECK4-NEXT: store i32 [[A]], i32* [[A_ADDR]], align 4
+// CHECK4-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+// CHECK4-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK4-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+// CHECK4-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]]
+// CHECK4-NEXT: [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]]
+// CHECK4-NEXT: br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]]
+// CHECK4: .worker:
+// CHECK4-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l58_worker() #[[ATTR2]]
+// CHECK4-NEXT: br label [[DOTEXIT:%.*]]
+// CHECK4: .mastercheck:
+// CHECK4-NEXT: [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+// CHECK4-NEXT: [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK4-NEXT: [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+// CHECK4-NEXT: [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1
+// CHECK4-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1
+// CHECK4-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], -1
+// CHECK4-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP2]], [[TMP3]]
+// CHECK4-NEXT: [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]]
+// CHECK4-NEXT: br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]]
+// CHECK4: .master:
+// CHECK4-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK4-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+// CHECK4-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]]
+// CHECK4-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1)
+// CHECK4-NEXT: call void @__kmpc_data_sharing_init_stack()
+// CHECK4-NEXT: [[TMP5:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared", align 2
+// CHECK4-NEXT: [[TMP6:%.*]] = load i32, i32* @"_openmp_static_kernel$size", align 4
+// CHECK4-NEXT: call void @__kmpc_get_team_static_memory(i16 0, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds (%"union._shared_openmp_static_memory_type_$_", %"union._shared_openmp_static_memory_type_$_" addrspace(3)* @"_openmp_shared_static_glob_rd_$_", i32 0, i32 0, i32 0) to i8*), i32 [[TMP6]], i16 [[TMP5]], i8** addrspacecast (i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr" to i8**))
+// CHECK4-NEXT: [[TMP7:%.*]] = load i8*, i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr", align 4
+// CHECK4-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, i8* [[TMP7]], i32 0
+// CHECK4-NEXT: [[TMP9:%.*]] = bitcast i8* [[TMP8]] to %struct._globalized_locals_ty*
+// CHECK4-NEXT: [[TMP10:%.*]] = load i32, i32* [[A_ADDR]], align 4
+// CHECK4-NEXT: [[A7:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP9]], i32 0, i32 0
+// CHECK4-NEXT: store i32 [[TMP10]], i32* [[A7]], align 4
+// CHECK4-NEXT: [[TMP11:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK4-NEXT: [[TMP12:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0
+// CHECK4-NEXT: [[TMP13:%.*]] = bitcast i32* [[A7]] to i8*
+// CHECK4-NEXT: store i8* [[TMP13]], i8** [[TMP12]], align 4
+// CHECK4-NEXT: [[TMP14:%.*]] = bitcast [1 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK4-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP11]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32*)* @__omp_outlined__4 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__4_wrapper to i8*), i8** [[TMP14]], i32 1)
+// CHECK4-NEXT: [[TMP15:%.*]] = load i32, i32* [[A7]], align 4
+// CHECK4-NEXT: [[INC:%.*]] = add nsw i32 [[TMP15]], 1
+// CHECK4-NEXT: store i32 [[INC]], i32* [[A7]], align 4
+// CHECK4-NEXT: [[TMP16:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared", align 2
+// CHECK4-NEXT: call void @__kmpc_restore_team_static_memory(i16 0, i16 [[TMP16]])
+// CHECK4-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]]
+// CHECK4: .termination.notifier:
+// CHECK4-NEXT: call void @__kmpc_kernel_deinit(i16 1)
+// CHECK4-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
+// CHECK4-NEXT: br label [[DOTEXIT]]
+// CHECK4: .exit:
+// CHECK4-NEXT: ret void
+//
+//
+// CHECK4-LABEL: define {{[^@]+}}@__omp_outlined__4
+// CHECK4-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR0]] {
+// CHECK4-NEXT: entry:
+// CHECK4-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK4-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK4-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 4
+// CHECK4-NEXT: [[CRITICAL_COUNTER:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK4-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK4-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 4
+// CHECK4-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 4
+// CHECK4-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_warp_active_thread_mask()
+// CHECK4-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+// CHECK4-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK4-NEXT: store i32 0, i32* [[CRITICAL_COUNTER]], align 4
+// CHECK4-NEXT: br label [[OMP_CRITICAL_LOOP:%.*]]
+// CHECK4: omp.critical.loop:
+// CHECK4-NEXT: [[TMP2:%.*]] = load i32, i32* [[CRITICAL_COUNTER]], align 4
+// CHECK4-NEXT: [[TMP3:%.*]] = icmp slt i32 [[TMP2]], [[NVPTX_NUM_THREADS]]
+// CHECK4-NEXT: br i1 [[TMP3]], label [[OMP_CRITICAL_TEST:%.*]], label [[OMP_CRITICAL_EXIT:%.*]]
+// CHECK4: omp.critical.test:
+// CHECK4-NEXT: [[TMP4:%.*]] = load i32, i32* [[CRITICAL_COUNTER]], align 4
+// CHECK4-NEXT: [[TMP5:%.*]] = icmp eq i32 [[NVPTX_TID]], [[TMP4]]
+// CHECK4-NEXT: br i1 [[TMP5]], label [[OMP_CRITICAL_BODY:%.*]], label [[OMP_CRITICAL_SYNC:%.*]]
+// CHECK4: omp.critical.body:
+// CHECK4-NEXT: [[TMP6:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK4-NEXT: [[TMP7:%.*]] = load i32, i32* [[TMP6]], align 4
+// CHECK4-NEXT: call void @__kmpc_critical(%struct.ident_t* @[[GLOB1]], i32 [[TMP7]], [8 x i32]* @"_gomp_critical_user_$var")
+// CHECK4-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP0]], align 4
+// CHECK4-NEXT: [[INC:%.*]] = add nsw i32 [[TMP8]], 1
+// CHECK4-NEXT: store i32 [[INC]], i32* [[TMP0]], align 4
+// CHECK4-NEXT: call void @__kmpc_end_critical(%struct.ident_t* @[[GLOB1]], i32 [[TMP7]], [8 x i32]* @"_gomp_critical_user_$var")
+// CHECK4-NEXT: br label [[OMP_CRITICAL_SYNC]]
+// CHECK4: omp.critical.sync:
+// CHECK4-NEXT: call void @__kmpc_syncwarp(i32 [[TMP1]])
+// CHECK4-NEXT: [[TMP9:%.*]] = add nsw i32 [[TMP4]], 1
+// CHECK4-NEXT: store i32 [[TMP9]], i32* [[CRITICAL_COUNTER]], align 4
+// CHECK4-NEXT: br label [[OMP_CRITICAL_LOOP]]
+// CHECK4: omp.critical.exit:
+// CHECK4-NEXT: ret void
+//
+//
+// CHECK4-LABEL: define {{[^@]+}}@__omp_outlined__4_wrapper
+// CHECK4-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] {
+// CHECK4-NEXT: entry:
+// CHECK4-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2
+// CHECK4-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 4
+// CHECK4-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK4-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2
+// CHECK4-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
+// CHECK4-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]])
+// CHECK4-NEXT: [[TMP2:%.*]] = load i8**, i8*** [[GLOBAL_ARGS]], align 4
+// CHECK4-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8*, i8** [[TMP2]], i32 0
+// CHECK4-NEXT: [[TMP4:%.*]] = bitcast i8** [[TMP3]] to i32**
+// CHECK4-NEXT: [[TMP5:%.*]] = load i32*, i32** [[TMP4]], align 4
+// CHECK4-NEXT: call void @__omp_outlined__4(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]], i32* [[TMP5]]) #[[ATTR2]]
+// CHECK4-NEXT: ret void
+//
+//
+// CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l29_worker
+// CHECK5-SAME: () #[[ATTR0:[0-9]+]] {
+// CHECK5-NEXT: entry:
+// CHECK5-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 4
+// CHECK5-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1
+// CHECK5-NEXT: store i8* null, i8** [[WORK_FN]], align 4
+// CHECK5-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1
+// CHECK5-NEXT: br label [[DOTAWAIT_WORK:%.*]]
+// CHECK5: .await.work:
+// CHECK5-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
+// CHECK5-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]])
+// CHECK5-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8
+// CHECK5-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1
+// CHECK5-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 4
+// CHECK5-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null
+// CHECK5-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]]
+// CHECK5: .select.workers:
+// CHECK5-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1
+// CHECK5-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0
+// CHECK5-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]]
+// CHECK5: .execute.parallel:
+// CHECK5-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]])
+// CHECK5-NEXT: [[TMP5:%.*]] = load i8*, i8** [[WORK_FN]], align 4
+// CHECK5-NEXT: [[WORK_MATCH:%.*]] = icmp eq i8* [[TMP5]], bitcast (void (i16, i32)* @__omp_outlined___wrapper to i8*)
+// CHECK5-NEXT: br i1 [[WORK_MATCH]], label [[DOTEXECUTE_FN:%.*]], label [[DOTCHECK_NEXT:%.*]]
+// CHECK5: .execute.fn:
+// CHECK5-NEXT: call void @__omp_outlined___wrapper(i16 0, i32 [[TMP4]]) #[[ATTR3:[0-9]+]]
+// CHECK5-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]]
+// CHECK5: .check.next:
+// CHECK5-NEXT: [[TMP6:%.*]] = load i8*, i8** [[WORK_FN]], align 4
+// CHECK5-NEXT: [[WORK_MATCH1:%.*]] = icmp eq i8* [[TMP6]], bitcast (void (i16, i32)* @__omp_outlined__1_wrapper to i8*)
+// CHECK5-NEXT: br i1 [[WORK_MATCH1]], label [[DOTEXECUTE_FN2:%.*]], label [[DOTCHECK_NEXT3:%.*]]
+// CHECK5: .execute.fn2:
+// CHECK5-NEXT: call void @__omp_outlined__1_wrapper(i16 0, i32 [[TMP4]]) #[[ATTR3]]
+// CHECK5-NEXT: br label [[DOTTERMINATE_PARALLEL]]
+// CHECK5: .check.next3:
+// CHECK5-NEXT: [[TMP7:%.*]] = load i8*, i8** [[WORK_FN]], align 4
+// CHECK5-NEXT: [[WORK_MATCH4:%.*]] = icmp eq i8* [[TMP7]], bitcast (void (i16, i32)* @__omp_outlined__2_wrapper to i8*)
+// CHECK5-NEXT: br i1 [[WORK_MATCH4]], label [[DOTEXECUTE_FN5:%.*]], label [[DOTCHECK_NEXT6:%.*]]
+// CHECK5: .execute.fn5:
+// CHECK5-NEXT: call void @__omp_outlined__2_wrapper(i16 0, i32 [[TMP4]]) #[[ATTR3]]
+// CHECK5-NEXT: br label [[DOTTERMINATE_PARALLEL]]
+// CHECK5: .check.next6:
+// CHECK5-NEXT: [[TMP8:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)*
+// CHECK5-NEXT: call void [[TMP8]](i16 0, i32 [[TMP4]])
+// CHECK5-NEXT: br label [[DOTTERMINATE_PARALLEL]]
+// CHECK5: .terminate.parallel:
+// CHECK5-NEXT: call void @__kmpc_kernel_end_parallel()
+// CHECK5-NEXT: br label [[DOTBARRIER_PARALLEL]]
+// CHECK5: .barrier.parallel:
+// CHECK5-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
+// CHECK5-NEXT: br label [[DOTAWAIT_WORK]]
+// CHECK5: .exit:
+// CHECK5-NEXT: ret void
+//
+//
+// CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l29
+// CHECK5-SAME: (i32 [[A:%.*]]) #[[ATTR1:[0-9]+]] {
+// CHECK5-NEXT: entry:
+// CHECK5-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 4
+// CHECK5-NEXT: [[CAPTURED_VARS_ADDRS7:%.*]] = alloca [0 x i8*], align 4
+// CHECK5-NEXT: [[CAPTURED_VARS_ADDRS8:%.*]] = alloca [0 x i8*], align 4
+// CHECK5-NEXT: store i32 [[A]], i32* [[A_ADDR]], align 4
+// CHECK5-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+// CHECK5-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK5-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+// CHECK5-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]]
+// CHECK5-NEXT: [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]]
+// CHECK5-NEXT: br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]]
+// CHECK5: .worker:
+// CHECK5-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l29_worker() #[[ATTR3]]
+// CHECK5-NEXT: br label [[DOTEXIT:%.*]]
+// CHECK5: .mastercheck:
+// CHECK5-NEXT: [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+// CHECK5-NEXT: [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK5-NEXT: [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+// CHECK5-NEXT: [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1
+// CHECK5-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1
+// CHECK5-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], -1
+// CHECK5-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP2]], [[TMP3]]
+// CHECK5-NEXT: [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]]
+// CHECK5-NEXT: br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]]
+// CHECK5: .master:
+// CHECK5-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK5-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+// CHECK5-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]]
+// CHECK5-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1)
+// CHECK5-NEXT: call void @__kmpc_data_sharing_init_stack()
+// CHECK5-NEXT: [[TMP5:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK5-NEXT: [[TMP6:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK5-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__ to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined___wrapper to i8*), i8** [[TMP6]], i32 0)
+// CHECK5-NEXT: [[TMP7:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS7]] to i8**
+// CHECK5-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i32 0, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__1 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__1_wrapper to i8*), i8** [[TMP7]], i32 0)
+// CHECK5-NEXT: [[TMP8:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS8]] to i8**
+// CHECK5-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__2 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__2_wrapper to i8*), i8** [[TMP8]], i32 0)
+// CHECK5-NEXT: [[TMP9:%.*]] = load i32, i32* [[A_ADDR]], align 4
+// CHECK5-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP9]], 1
+// CHECK5-NEXT: store i32 [[ADD]], i32* [[A_ADDR]], align 4
+// CHECK5-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]]
+// CHECK5: .termination.notifier:
+// CHECK5-NEXT: call void @__kmpc_kernel_deinit(i16 1)
+// CHECK5-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
+// CHECK5-NEXT: br label [[DOTEXIT]]
+// CHECK5: .exit:
+// CHECK5-NEXT: ret void
+//
+//
+// CHECK5-LABEL: define {{[^@]+}}@__omp_outlined__
+// CHECK5-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
+// CHECK5-NEXT: entry:
+// CHECK5-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK5-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK5-NEXT: [[A:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK5-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK5-NEXT: store i32 42, i32* [[A]], align 4
+// CHECK5-NEXT: ret void
+//
+//
+// CHECK5-LABEL: define {{[^@]+}}@__omp_outlined___wrapper
+// CHECK5-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] {
+// CHECK5-NEXT: entry:
+// CHECK5-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2
+// CHECK5-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 4
+// CHECK5-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK5-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2
+// CHECK5-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
+// CHECK5-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]])
+// CHECK5-NEXT: call void @__omp_outlined__(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]]
+// CHECK5-NEXT: ret void
+//
+//
+// CHECK5-LABEL: define {{[^@]+}}@__omp_outlined__1
+// CHECK5-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
+// CHECK5-NEXT: entry:
+// CHECK5-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK5-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK5-NEXT: [[A:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK5-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK5-NEXT: store i32 43, i32* [[A]], align 4
+// CHECK5-NEXT: ret void
+//
+//
+// CHECK5-LABEL: define {{[^@]+}}@__omp_outlined__1_wrapper
+// CHECK5-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] {
+// CHECK5-NEXT: entry:
+// CHECK5-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2
+// CHECK5-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 4
+// CHECK5-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK5-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2
+// CHECK5-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
+// CHECK5-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]])
+// CHECK5-NEXT: call void @__omp_outlined__1(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]]
+// CHECK5-NEXT: ret void
+//
+//
+// CHECK5-LABEL: define {{[^@]+}}@__omp_outlined__2
+// CHECK5-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
+// CHECK5-NEXT: entry:
+// CHECK5-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK5-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK5-NEXT: [[A:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK5-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK5-NEXT: store i32 44, i32* [[A]], align 4
+// CHECK5-NEXT: ret void
+//
+//
+// CHECK5-LABEL: define {{[^@]+}}@__omp_outlined__2_wrapper
+// CHECK5-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] {
+// CHECK5-NEXT: entry:
+// CHECK5-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2
+// CHECK5-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 4
+// CHECK5-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK5-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2
+// CHECK5-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
+// CHECK5-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]])
+// CHECK5-NEXT: call void @__omp_outlined__2(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]]
+// CHECK5-NEXT: ret void
+//
+//
+// CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l46_worker
+// CHECK5-SAME: () #[[ATTR0]] {
+// CHECK5-NEXT: entry:
+// CHECK5-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 4
+// CHECK5-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1
+// CHECK5-NEXT: store i8* null, i8** [[WORK_FN]], align 4
+// CHECK5-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1
+// CHECK5-NEXT: br label [[DOTAWAIT_WORK:%.*]]
+// CHECK5: .await.work:
+// CHECK5-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
+// CHECK5-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]])
+// CHECK5-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8
+// CHECK5-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1
+// CHECK5-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 4
+// CHECK5-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null
+// CHECK5-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]]
+// CHECK5: .select.workers:
+// CHECK5-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1
+// CHECK5-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0
+// CHECK5-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]]
+// CHECK5: .execute.parallel:
+// CHECK5-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK5-NEXT: [[TMP5:%.*]] = load i8*, i8** [[WORK_FN]], align 4
+// CHECK5-NEXT: [[WORK_MATCH:%.*]] = icmp eq i8* [[TMP5]], bitcast (void (i16, i32)* @__omp_outlined__3_wrapper to i8*)
+// CHECK5-NEXT: br i1 [[WORK_MATCH]], label [[DOTEXECUTE_FN:%.*]], label [[DOTCHECK_NEXT:%.*]]
+// CHECK5: .execute.fn:
+// CHECK5-NEXT: call void @__omp_outlined__3_wrapper(i16 0, i32 [[TMP4]]) #[[ATTR3]]
+// CHECK5-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]]
+// CHECK5: .check.next:
+// CHECK5-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)*
+// CHECK5-NEXT: call void [[TMP6]](i16 0, i32 [[TMP4]])
+// CHECK5-NEXT: br label [[DOTTERMINATE_PARALLEL]]
+// CHECK5: .terminate.parallel:
+// CHECK5-NEXT: call void @__kmpc_kernel_end_parallel()
+// CHECK5-NEXT: br label [[DOTBARRIER_PARALLEL]]
+// CHECK5: .barrier.parallel:
+// CHECK5-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
+// CHECK5-NEXT: br label [[DOTAWAIT_WORK]]
+// CHECK5: .exit:
+// CHECK5-NEXT: ret void
+//
+//
+// CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l46
+// CHECK5-SAME: (i32 [[N:%.*]], i32 [[A:%.*]], i32 [[AA:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR1]] {
+// CHECK5-NEXT: entry:
+// CHECK5-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[AA_ADDR:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4
+// CHECK5-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 4
+// CHECK5-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4
+// CHECK5-NEXT: store i32 [[A]], i32* [[A_ADDR]], align 4
+// CHECK5-NEXT: store i32 [[AA]], i32* [[AA_ADDR]], align 4
+// CHECK5-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4
+// CHECK5-NEXT: [[CONV:%.*]] = bitcast i32* [[AA_ADDR]] to i16*
+// CHECK5-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4
+// CHECK5-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+// CHECK5-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK5-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+// CHECK5-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]]
+// CHECK5-NEXT: [[TMP1:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]]
+// CHECK5-NEXT: br i1 [[TMP1]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]]
+// CHECK5: .worker:
+// CHECK5-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l46_worker() #[[ATTR3]]
+// CHECK5-NEXT: br label [[DOTEXIT:%.*]]
+// CHECK5: .mastercheck:
+// CHECK5-NEXT: [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+// CHECK5-NEXT: [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK5-NEXT: [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+// CHECK5-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1
+// CHECK5-NEXT: [[TMP3:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1
+// CHECK5-NEXT: [[TMP4:%.*]] = xor i32 [[TMP2]], -1
+// CHECK5-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP3]], [[TMP4]]
+// CHECK5-NEXT: [[TMP5:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]]
+// CHECK5-NEXT: br i1 [[TMP5]], label [[DOTMASTER:%.*]], label [[DOTEXIT]]
+// CHECK5: .master:
+// CHECK5-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK5-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+// CHECK5-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]]
+// CHECK5-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1)
+// CHECK5-NEXT: call void @__kmpc_data_sharing_init_stack()
+// CHECK5-NEXT: [[TMP6:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK5-NEXT: [[TMP7:%.*]] = load i32, i32* [[N_ADDR]], align 4
+// CHECK5-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP7]], 1000
+// CHECK5-NEXT: [[TMP8:%.*]] = zext i1 [[CMP]] to i32
+// CHECK5-NEXT: [[TMP9:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK5-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP6]], i32 [[TMP8]], i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__3 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__3_wrapper to i8*), i8** [[TMP9]], i32 0)
+// CHECK5-NEXT: [[TMP10:%.*]] = load i32, i32* [[A_ADDR]], align 4
+// CHECK5-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP10]], 1
+// CHECK5-NEXT: store i32 [[ADD]], i32* [[A_ADDR]], align 4
+// CHECK5-NEXT: [[TMP11:%.*]] = load i16, i16* [[CONV]], align 4
+// CHECK5-NEXT: [[CONV7:%.*]] = sext i16 [[TMP11]] to i32
+// CHECK5-NEXT: [[ADD8:%.*]] = add nsw i32 [[CONV7]], 1
+// CHECK5-NEXT: [[CONV9:%.*]] = trunc i32 [[ADD8]] to i16
+// CHECK5-NEXT: store i16 [[CONV9]], i16* [[CONV]], align 4
+// CHECK5-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP0]], i32 0, i32 2
+// CHECK5-NEXT: [[TMP12:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+// CHECK5-NEXT: [[ADD10:%.*]] = add nsw i32 [[TMP12]], 1
+// CHECK5-NEXT: store i32 [[ADD10]], i32* [[ARRAYIDX]], align 4
+// CHECK5-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]]
+// CHECK5: .termination.notifier:
+// CHECK5-NEXT: call void @__kmpc_kernel_deinit(i16 1)
+// CHECK5-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
+// CHECK5-NEXT: br label [[DOTEXIT]]
+// CHECK5: .exit:
+// CHECK5-NEXT: ret void
+//
+//
+// CHECK5-LABEL: define {{[^@]+}}@__omp_outlined__3
+// CHECK5-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
+// CHECK5-NEXT: entry:
+// CHECK5-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK5-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK5-NEXT: [[A:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK5-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK5-NEXT: store i32 45, i32* [[A]], align 4
+// CHECK5-NEXT: [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK5-NEXT: [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
+// CHECK5-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP1]])
+// CHECK5-NEXT: ret void
+//
+//
+// CHECK5-LABEL: define {{[^@]+}}@__omp_outlined__3_wrapper
+// CHECK5-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] {
+// CHECK5-NEXT: entry:
+// CHECK5-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2
+// CHECK5-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 4
+// CHECK5-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK5-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2
+// CHECK5-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
+// CHECK5-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]])
+// CHECK5-NEXT: call void @__omp_outlined__3(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]]
+// CHECK5-NEXT: ret void
+//
+//
+// CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l58_worker
+// CHECK5-SAME: () #[[ATTR0]] {
+// CHECK5-NEXT: entry:
+// CHECK5-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 4
+// CHECK5-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1
+// CHECK5-NEXT: store i8* null, i8** [[WORK_FN]], align 4
+// CHECK5-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1
+// CHECK5-NEXT: br label [[DOTAWAIT_WORK:%.*]]
+// CHECK5: .await.work:
+// CHECK5-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
+// CHECK5-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]])
+// CHECK5-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8
+// CHECK5-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1
+// CHECK5-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 4
+// CHECK5-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null
+// CHECK5-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]]
+// CHECK5: .select.workers:
+// CHECK5-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1
+// CHECK5-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0
+// CHECK5-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]]
+// CHECK5: .execute.parallel:
+// CHECK5-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK5-NEXT: [[TMP5:%.*]] = load i8*, i8** [[WORK_FN]], align 4
+// CHECK5-NEXT: [[WORK_MATCH:%.*]] = icmp eq i8* [[TMP5]], bitcast (void (i16, i32)* @__omp_outlined__4_wrapper to i8*)
+// CHECK5-NEXT: br i1 [[WORK_MATCH]], label [[DOTEXECUTE_FN:%.*]], label [[DOTCHECK_NEXT:%.*]]
+// CHECK5: .execute.fn:
+// CHECK5-NEXT: call void @__omp_outlined__4_wrapper(i16 0, i32 [[TMP4]]) #[[ATTR3]]
+// CHECK5-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]]
+// CHECK5: .check.next:
+// CHECK5-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)*
+// CHECK5-NEXT: call void [[TMP6]](i16 0, i32 [[TMP4]])
+// CHECK5-NEXT: br label [[DOTTERMINATE_PARALLEL]]
+// CHECK5: .terminate.parallel:
+// CHECK5-NEXT: call void @__kmpc_kernel_end_parallel()
+// CHECK5-NEXT: br label [[DOTBARRIER_PARALLEL]]
+// CHECK5: .barrier.parallel:
+// CHECK5-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
+// CHECK5-NEXT: br label [[DOTAWAIT_WORK]]
+// CHECK5: .exit:
+// CHECK5-NEXT: ret void
+//
+//
+// CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l58
+// CHECK5-SAME: (i32 [[A:%.*]]) #[[ATTR1]] {
+// CHECK5-NEXT: entry:
+// CHECK5-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [1 x i8*], align 4
+// CHECK5-NEXT: store i32 [[A]], i32* [[A_ADDR]], align 4
+// CHECK5-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+// CHECK5-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK5-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+// CHECK5-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]]
+// CHECK5-NEXT: [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]]
+// CHECK5-NEXT: br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]]
+// CHECK5: .worker:
+// CHECK5-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l58_worker() #[[ATTR3]]
+// CHECK5-NEXT: br label [[DOTEXIT:%.*]]
+// CHECK5: .mastercheck:
+// CHECK5-NEXT: [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+// CHECK5-NEXT: [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK5-NEXT: [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+// CHECK5-NEXT: [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1
+// CHECK5-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1
+// CHECK5-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], -1
+// CHECK5-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP2]], [[TMP3]]
+// CHECK5-NEXT: [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]]
+// CHECK5-NEXT: br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]]
+// CHECK5: .master:
+// CHECK5-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK5-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+// CHECK5-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]]
+// CHECK5-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1)
+// CHECK5-NEXT: call void @__kmpc_data_sharing_init_stack()
+// CHECK5-NEXT: [[TMP5:%.*]] = call i8* @__kmpc_data_sharing_push_stack(i32 4, i16 1)
+// CHECK5-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty*
+// CHECK5-NEXT: [[TMP7:%.*]] = load i32, i32* [[A_ADDR]], align 4
+// CHECK5-NEXT: [[A7:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP6]], i32 0, i32 0
+// CHECK5-NEXT: store i32 [[TMP7]], i32* [[A7]], align 4
+// CHECK5-NEXT: [[TMP8:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK5-NEXT: [[TMP9:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0
+// CHECK5-NEXT: [[TMP10:%.*]] = bitcast i32* [[A7]] to i8*
+// CHECK5-NEXT: store i8* [[TMP10]], i8** [[TMP9]], align 4
+// CHECK5-NEXT: [[TMP11:%.*]] = bitcast [1 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK5-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP8]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32*)* @__omp_outlined__4 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__4_wrapper to i8*), i8** [[TMP11]], i32 1)
+// CHECK5-NEXT: [[TMP12:%.*]] = load i32, i32* [[A7]], align 4
+// CHECK5-NEXT: [[INC:%.*]] = add nsw i32 [[TMP12]], 1
+// CHECK5-NEXT: store i32 [[INC]], i32* [[A7]], align 4
+// CHECK5-NEXT: call void @__kmpc_data_sharing_pop_stack(i8* [[TMP5]])
+// CHECK5-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]]
+// CHECK5: .termination.notifier:
+// CHECK5-NEXT: call void @__kmpc_kernel_deinit(i16 1)
+// CHECK5-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
+// CHECK5-NEXT: br label [[DOTEXIT]]
+// CHECK5: .exit:
+// CHECK5-NEXT: ret void
+//
+//
+// CHECK5-LABEL: define {{[^@]+}}@__omp_outlined__4
+// CHECK5-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR1]] {
+// CHECK5-NEXT: entry:
+// CHECK5-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK5-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK5-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 4
+// CHECK5-NEXT: [[CRITICAL_COUNTER:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK5-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK5-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 4
+// CHECK5-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 4
+// CHECK5-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_warp_active_thread_mask()
+// CHECK5-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+// CHECK5-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK5-NEXT: store i32 0, i32* [[CRITICAL_COUNTER]], align 4
+// CHECK5-NEXT: br label [[OMP_CRITICAL_LOOP:%.*]]
+// CHECK5: omp.critical.loop:
+// CHECK5-NEXT: [[TMP2:%.*]] = load i32, i32* [[CRITICAL_COUNTER]], align 4
+// CHECK5-NEXT: [[TMP3:%.*]] = icmp slt i32 [[TMP2]], [[NVPTX_NUM_THREADS]]
+// CHECK5-NEXT: br i1 [[TMP3]], label [[OMP_CRITICAL_TEST:%.*]], label [[OMP_CRITICAL_EXIT:%.*]]
+// CHECK5: omp.critical.test:
+// CHECK5-NEXT: [[TMP4:%.*]] = load i32, i32* [[CRITICAL_COUNTER]], align 4
+// CHECK5-NEXT: [[TMP5:%.*]] = icmp eq i32 [[NVPTX_TID]], [[TMP4]]
+// CHECK5-NEXT: br i1 [[TMP5]], label [[OMP_CRITICAL_BODY:%.*]], label [[OMP_CRITICAL_SYNC:%.*]]
+// CHECK5: omp.critical.body:
+// CHECK5-NEXT: [[TMP6:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK5-NEXT: [[TMP7:%.*]] = load i32, i32* [[TMP6]], align 4
+// CHECK5-NEXT: call void @__kmpc_critical(%struct.ident_t* @[[GLOB1]], i32 [[TMP7]], [8 x i32]* @"_gomp_critical_user_$var")
+// CHECK5-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP0]], align 4
+// CHECK5-NEXT: [[INC:%.*]] = add nsw i32 [[TMP8]], 1
+// CHECK5-NEXT: store i32 [[INC]], i32* [[TMP0]], align 4
+// CHECK5-NEXT: call void @__kmpc_end_critical(%struct.ident_t* @[[GLOB1]], i32 [[TMP7]], [8 x i32]* @"_gomp_critical_user_$var")
+// CHECK5-NEXT: br label [[OMP_CRITICAL_SYNC]]
+// CHECK5: omp.critical.sync:
+// CHECK5-NEXT: call void @__kmpc_syncwarp(i32 [[TMP1]])
+// CHECK5-NEXT: [[TMP9:%.*]] = add nsw i32 [[TMP4]], 1
+// CHECK5-NEXT: store i32 [[TMP9]], i32* [[CRITICAL_COUNTER]], align 4
+// CHECK5-NEXT: br label [[OMP_CRITICAL_LOOP]]
+// CHECK5: omp.critical.exit:
+// CHECK5-NEXT: ret void
+//
+//
+// CHECK5-LABEL: define {{[^@]+}}@__omp_outlined__4_wrapper
+// CHECK5-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] {
+// CHECK5-NEXT: entry:
+// CHECK5-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2
+// CHECK5-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 4
+// CHECK5-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK5-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2
+// CHECK5-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
+// CHECK5-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]])
+// CHECK5-NEXT: [[TMP2:%.*]] = load i8**, i8*** [[GLOBAL_ARGS]], align 4
+// CHECK5-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8*, i8** [[TMP2]], i32 0
+// CHECK5-NEXT: [[TMP4:%.*]] = bitcast i8** [[TMP3]] to i32**
+// CHECK5-NEXT: [[TMP5:%.*]] = load i32*, i32** [[TMP4]], align 4
+// CHECK5-NEXT: call void @__omp_outlined__4(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]], i32* [[TMP5]]) #[[ATTR3]]
+// CHECK5-NEXT: ret void
+//
+//
+// CHECK6-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l29_worker
+// CHECK6-SAME: () #[[ATTR0:[0-9]+]] {
+// CHECK6-NEXT: entry:
+// CHECK6-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 4
+// CHECK6-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1
+// CHECK6-NEXT: store i8* null, i8** [[WORK_FN]], align 4
+// CHECK6-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1
+// CHECK6-NEXT: br label [[DOTAWAIT_WORK:%.*]]
+// CHECK6: .await.work:
+// CHECK6-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
+// CHECK6-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]])
+// CHECK6-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8
+// CHECK6-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1
+// CHECK6-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 4
+// CHECK6-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null
+// CHECK6-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]]
+// CHECK6: .select.workers:
+// CHECK6-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1
+// CHECK6-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0
+// CHECK6-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]]
+// CHECK6: .execute.parallel:
+// CHECK6-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]])
+// CHECK6-NEXT: [[TMP5:%.*]] = load i8*, i8** [[WORK_FN]], align 4
+// CHECK6-NEXT: [[WORK_MATCH:%.*]] = icmp eq i8* [[TMP5]], bitcast (void (i16, i32)* @__omp_outlined___wrapper to i8*)
+// CHECK6-NEXT: br i1 [[WORK_MATCH]], label [[DOTEXECUTE_FN:%.*]], label [[DOTCHECK_NEXT:%.*]]
+// CHECK6: .execute.fn:
+// CHECK6-NEXT: call void @__omp_outlined___wrapper(i16 0, i32 [[TMP4]]) #[[ATTR3:[0-9]+]]
+// CHECK6-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]]
+// CHECK6: .check.next:
+// CHECK6-NEXT: [[TMP6:%.*]] = load i8*, i8** [[WORK_FN]], align 4
+// CHECK6-NEXT: [[WORK_MATCH1:%.*]] = icmp eq i8* [[TMP6]], bitcast (void (i16, i32)* @__omp_outlined__1_wrapper to i8*)
+// CHECK6-NEXT: br i1 [[WORK_MATCH1]], label [[DOTEXECUTE_FN2:%.*]], label [[DOTCHECK_NEXT3:%.*]]
+// CHECK6: .execute.fn2:
+// CHECK6-NEXT: call void @__omp_outlined__1_wrapper(i16 0, i32 [[TMP4]]) #[[ATTR3]]
+// CHECK6-NEXT: br label [[DOTTERMINATE_PARALLEL]]
+// CHECK6: .check.next3:
+// CHECK6-NEXT: [[TMP7:%.*]] = load i8*, i8** [[WORK_FN]], align 4
+// CHECK6-NEXT: [[WORK_MATCH4:%.*]] = icmp eq i8* [[TMP7]], bitcast (void (i16, i32)* @__omp_outlined__2_wrapper to i8*)
+// CHECK6-NEXT: br i1 [[WORK_MATCH4]], label [[DOTEXECUTE_FN5:%.*]], label [[DOTCHECK_NEXT6:%.*]]
+// CHECK6: .execute.fn5:
+// CHECK6-NEXT: call void @__omp_outlined__2_wrapper(i16 0, i32 [[TMP4]]) #[[ATTR3]]
+// CHECK6-NEXT: br label [[DOTTERMINATE_PARALLEL]]
+// CHECK6: .check.next6:
+// CHECK6-NEXT: [[TMP8:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)*
+// CHECK6-NEXT: call void [[TMP8]](i16 0, i32 [[TMP4]])
+// CHECK6-NEXT: br label [[DOTTERMINATE_PARALLEL]]
+// CHECK6: .terminate.parallel:
+// CHECK6-NEXT: call void @__kmpc_kernel_end_parallel()
+// CHECK6-NEXT: br label [[DOTBARRIER_PARALLEL]]
+// CHECK6: .barrier.parallel:
+// CHECK6-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
+// CHECK6-NEXT: br label [[DOTAWAIT_WORK]]
+// CHECK6: .exit:
+// CHECK6-NEXT: ret void
+//
+//
+// CHECK6-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l29
+// CHECK6-SAME: (i32 [[A:%.*]]) #[[ATTR1:[0-9]+]] {
+// CHECK6-NEXT: entry:
+// CHECK6-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4
+// CHECK6-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 4
+// CHECK6-NEXT: [[CAPTURED_VARS_ADDRS7:%.*]] = alloca [0 x i8*], align 4
+// CHECK6-NEXT: [[CAPTURED_VARS_ADDRS8:%.*]] = alloca [0 x i8*], align 4
+// CHECK6-NEXT: store i32 [[A]], i32* [[A_ADDR]], align 4
+// CHECK6-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+// CHECK6-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK6-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+// CHECK6-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]]
+// CHECK6-NEXT: [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]]
+// CHECK6-NEXT: br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]]
+// CHECK6: .worker:
+// CHECK6-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l29_worker() #[[ATTR3]]
+// CHECK6-NEXT: br label [[DOTEXIT:%.*]]
+// CHECK6: .mastercheck:
+// CHECK6-NEXT: [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+// CHECK6-NEXT: [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK6-NEXT: [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+// CHECK6-NEXT: [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1
+// CHECK6-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1
+// CHECK6-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], -1
+// CHECK6-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP2]], [[TMP3]]
+// CHECK6-NEXT: [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]]
+// CHECK6-NEXT: br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]]
+// CHECK6: .master:
+// CHECK6-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK6-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+// CHECK6-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]]
+// CHECK6-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1)
+// CHECK6-NEXT: call void @__kmpc_data_sharing_init_stack()
+// CHECK6-NEXT: [[TMP5:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK6-NEXT: [[TMP6:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK6-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__ to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined___wrapper to i8*), i8** [[TMP6]], i32 0)
+// CHECK6-NEXT: [[TMP7:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS7]] to i8**
+// CHECK6-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i32 0, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__1 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__1_wrapper to i8*), i8** [[TMP7]], i32 0)
+// CHECK6-NEXT: [[TMP8:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS8]] to i8**
+// CHECK6-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__2 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__2_wrapper to i8*), i8** [[TMP8]], i32 0)
+// CHECK6-NEXT: [[TMP9:%.*]] = load i32, i32* [[A_ADDR]], align 4
+// CHECK6-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP9]], 1
+// CHECK6-NEXT: store i32 [[ADD]], i32* [[A_ADDR]], align 4
+// CHECK6-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]]
+// CHECK6: .termination.notifier:
+// CHECK6-NEXT: call void @__kmpc_kernel_deinit(i16 1)
+// CHECK6-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
+// CHECK6-NEXT: br label [[DOTEXIT]]
+// CHECK6: .exit:
+// CHECK6-NEXT: ret void
+//
+//
+// CHECK6-LABEL: define {{[^@]+}}@__omp_outlined__
+// CHECK6-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
+// CHECK6-NEXT: entry:
+// CHECK6-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK6-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK6-NEXT: [[A:%.*]] = alloca i32, align 4
+// CHECK6-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK6-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK6-NEXT: store i32 42, i32* [[A]], align 4
+// CHECK6-NEXT: ret void
+//
+//
+// CHECK6-LABEL: define {{[^@]+}}@__omp_outlined___wrapper
+// CHECK6-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] {
+// CHECK6-NEXT: entry:
+// CHECK6-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2
+// CHECK6-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
+// CHECK6-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK6-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 4
+// CHECK6-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK6-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2
+// CHECK6-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
+// CHECK6-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]])
+// CHECK6-NEXT: call void @__omp_outlined__(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]]
+// CHECK6-NEXT: ret void
+//
+//
+// CHECK6-LABEL: define {{[^@]+}}@__omp_outlined__1
+// CHECK6-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
+// CHECK6-NEXT: entry:
+// CHECK6-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK6-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK6-NEXT: [[A:%.*]] = alloca i32, align 4
+// CHECK6-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK6-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK6-NEXT: store i32 43, i32* [[A]], align 4
+// CHECK6-NEXT: ret void
+//
+//
+// CHECK6-LABEL: define 
{{[^@]+}}@__omp_outlined__1_wrapper +// CHECK6-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { +// CHECK6-NEXT: entry: +// CHECK6-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 +// CHECK6-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 4 +// CHECK6-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK6-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 +// CHECK6-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK6-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) +// CHECK6-NEXT: call void @__omp_outlined__1(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]] +// CHECK6-NEXT: ret void +// +// +// CHECK6-LABEL: define {{[^@]+}}@__omp_outlined__2 +// CHECK6-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { +// CHECK6-NEXT: entry: +// CHECK6-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK6-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK6-NEXT: [[A:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK6-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK6-NEXT: store i32 44, i32* [[A]], align 4 +// CHECK6-NEXT: ret void +// +// +// CHECK6-LABEL: define {{[^@]+}}@__omp_outlined__2_wrapper +// CHECK6-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { +// CHECK6-NEXT: entry: +// CHECK6-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 +// CHECK6-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 4 +// CHECK6-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK6-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 +// CHECK6-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK6-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) +// CHECK6-NEXT: call void @__omp_outlined__2(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]] +// CHECK6-NEXT: ret void +// +// +// CHECK6-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l46_worker +// CHECK6-SAME: () #[[ATTR0]] { +// CHECK6-NEXT: entry: +// CHECK6-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 4 +// CHECK6-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1 +// CHECK6-NEXT: store i8* null, i8** [[WORK_FN]], align 4 +// CHECK6-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1 +// CHECK6-NEXT: br label [[DOTAWAIT_WORK:%.*]] +// CHECK6: .await.work: +// CHECK6-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK6-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]]) +// CHECK6-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8 +// CHECK6-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1 +// CHECK6-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 4 +// CHECK6-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null +// CHECK6-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]] +// CHECK6: .select.workers: +// CHECK6-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1 +// CHECK6-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0 +// CHECK6-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]] +// CHECK6: .execute.parallel: +// CHECK6-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* 
@[[GLOB1]]) +// CHECK6-NEXT: [[TMP5:%.*]] = load i8*, i8** [[WORK_FN]], align 4 +// CHECK6-NEXT: [[WORK_MATCH:%.*]] = icmp eq i8* [[TMP5]], bitcast (void (i16, i32)* @__omp_outlined__3_wrapper to i8*) +// CHECK6-NEXT: br i1 [[WORK_MATCH]], label [[DOTEXECUTE_FN:%.*]], label [[DOTCHECK_NEXT:%.*]] +// CHECK6: .execute.fn: +// CHECK6-NEXT: call void @__omp_outlined__3_wrapper(i16 0, i32 [[TMP4]]) #[[ATTR3]] +// CHECK6-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]] +// CHECK6: .check.next: +// CHECK6-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)* +// CHECK6-NEXT: call void [[TMP6]](i16 0, i32 [[TMP4]]) +// CHECK6-NEXT: br label [[DOTTERMINATE_PARALLEL]] +// CHECK6: .terminate.parallel: +// CHECK6-NEXT: call void @__kmpc_kernel_end_parallel() +// CHECK6-NEXT: br label [[DOTBARRIER_PARALLEL]] +// CHECK6: .barrier.parallel: +// CHECK6-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK6-NEXT: br label [[DOTAWAIT_WORK]] +// CHECK6: .exit: +// CHECK6-NEXT: ret void +// +// +// CHECK6-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l46 +// CHECK6-SAME: (i32 [[N:%.*]], i32 [[A:%.*]], i32 [[AA:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR1]] { +// CHECK6-NEXT: entry: +// CHECK6-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[AA_ADDR:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 +// CHECK6-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 4 +// CHECK6-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK6-NEXT: store i32 [[A]], i32* [[A_ADDR]], align 4 +// CHECK6-NEXT: store i32 [[AA]], i32* [[AA_ADDR]], align 4 +// CHECK6-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 +// CHECK6-NEXT: [[CONV:%.*]] = bitcast i32* [[AA_ADDR]] to i16* +// CHECK6-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 +// CHECK6-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK6-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK6-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK6-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]] +// CHECK6-NEXT: [[TMP1:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]] +// CHECK6-NEXT: br i1 [[TMP1]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]] +// CHECK6: .worker: +// CHECK6-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l46_worker() #[[ATTR3]] +// CHECK6-NEXT: br label [[DOTEXIT:%.*]] +// CHECK6: .mastercheck: +// CHECK6-NEXT: [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK6-NEXT: [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK6-NEXT: [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK6-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1 +// CHECK6-NEXT: [[TMP3:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1 +// CHECK6-NEXT: [[TMP4:%.*]] = xor i32 [[TMP2]], -1 +// CHECK6-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP3]], [[TMP4]] +// CHECK6-NEXT: [[TMP5:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]] +// CHECK6-NEXT: br i1 [[TMP5]], label [[DOTMASTER:%.*]], label [[DOTEXIT]] +// CHECK6: .master: +// CHECK6-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK6-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 
@llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK6-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]] +// CHECK6-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1) +// CHECK6-NEXT: call void @__kmpc_data_sharing_init_stack() +// CHECK6-NEXT: [[TMP6:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) +// CHECK6-NEXT: [[TMP7:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK6-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP7]], 1000 +// CHECK6-NEXT: [[TMP8:%.*]] = zext i1 [[CMP]] to i32 +// CHECK6-NEXT: [[TMP9:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK6-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP6]], i32 [[TMP8]], i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__3 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__3_wrapper to i8*), i8** [[TMP9]], i32 0) +// CHECK6-NEXT: [[TMP10:%.*]] = load i32, i32* [[A_ADDR]], align 4 +// CHECK6-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP10]], 1 +// CHECK6-NEXT: store i32 [[ADD]], i32* [[A_ADDR]], align 4 +// CHECK6-NEXT: [[TMP11:%.*]] = load i16, i16* [[CONV]], align 4 +// CHECK6-NEXT: [[CONV7:%.*]] = sext i16 [[TMP11]] to i32 +// CHECK6-NEXT: [[ADD8:%.*]] = add nsw i32 [[CONV7]], 1 +// CHECK6-NEXT: [[CONV9:%.*]] = trunc i32 [[ADD8]] to i16 +// CHECK6-NEXT: store i16 [[CONV9]], i16* [[CONV]], align 4 +// CHECK6-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP0]], i32 0, i32 2 +// CHECK6-NEXT: [[TMP12:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +// CHECK6-NEXT: [[ADD10:%.*]] = add nsw i32 [[TMP12]], 1 +// CHECK6-NEXT: store i32 [[ADD10]], i32* [[ARRAYIDX]], align 4 +// CHECK6-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]] +// CHECK6: .termination.notifier: +// CHECK6-NEXT: call void @__kmpc_kernel_deinit(i16 1) +// CHECK6-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK6-NEXT: br label [[DOTEXIT]] +// CHECK6: .exit: +// CHECK6-NEXT: ret void +// +// +// CHECK6-LABEL: define {{[^@]+}}@__omp_outlined__3 +// CHECK6-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { +// CHECK6-NEXT: entry: +// CHECK6-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK6-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK6-NEXT: [[A:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK6-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK6-NEXT: store i32 45, i32* [[A]], align 4 +// CHECK6-NEXT: [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK6-NEXT: [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4 +// CHECK6-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP1]]) +// CHECK6-NEXT: ret void +// +// +// CHECK6-LABEL: define {{[^@]+}}@__omp_outlined__3_wrapper +// CHECK6-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { +// CHECK6-NEXT: entry: +// CHECK6-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 +// CHECK6-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 4 +// CHECK6-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK6-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 +// CHECK6-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK6-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) +// CHECK6-NEXT: call 
void @__omp_outlined__3(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]] +// CHECK6-NEXT: ret void +// +// +// CHECK6-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l58_worker +// CHECK6-SAME: () #[[ATTR0]] { +// CHECK6-NEXT: entry: +// CHECK6-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 4 +// CHECK6-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1 +// CHECK6-NEXT: store i8* null, i8** [[WORK_FN]], align 4 +// CHECK6-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1 +// CHECK6-NEXT: br label [[DOTAWAIT_WORK:%.*]] +// CHECK6: .await.work: +// CHECK6-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK6-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]]) +// CHECK6-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8 +// CHECK6-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1 +// CHECK6-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 4 +// CHECK6-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null +// CHECK6-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]] +// CHECK6: .select.workers: +// CHECK6-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1 +// CHECK6-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0 +// CHECK6-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]] +// CHECK6: .execute.parallel: +// CHECK6-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) +// CHECK6-NEXT: [[TMP5:%.*]] = load i8*, i8** [[WORK_FN]], align 4 +// CHECK6-NEXT: [[WORK_MATCH:%.*]] = icmp eq i8* [[TMP5]], bitcast (void (i16, i32)* @__omp_outlined__4_wrapper to i8*) +// CHECK6-NEXT: br i1 [[WORK_MATCH]], label [[DOTEXECUTE_FN:%.*]], label [[DOTCHECK_NEXT:%.*]] +// CHECK6: .execute.fn: +// CHECK6-NEXT: call void @__omp_outlined__4_wrapper(i16 0, i32 [[TMP4]]) #[[ATTR3]] +// CHECK6-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]] +// CHECK6: .check.next: +// CHECK6-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)* +// CHECK6-NEXT: call void [[TMP6]](i16 0, i32 [[TMP4]]) +// CHECK6-NEXT: br label [[DOTTERMINATE_PARALLEL]] +// CHECK6: .terminate.parallel: +// CHECK6-NEXT: call void @__kmpc_kernel_end_parallel() +// CHECK6-NEXT: br label [[DOTBARRIER_PARALLEL]] +// CHECK6: .barrier.parallel: +// CHECK6-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK6-NEXT: br label [[DOTAWAIT_WORK]] +// CHECK6: .exit: +// CHECK6-NEXT: ret void +// +// +// CHECK6-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l58 +// CHECK6-SAME: (i32 [[A:%.*]]) #[[ATTR1]] { +// CHECK6-NEXT: entry: +// CHECK6-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [1 x i8*], align 4 +// CHECK6-NEXT: store i32 [[A]], i32* [[A_ADDR]], align 4 +// CHECK6-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK6-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK6-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK6-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]] +// CHECK6-NEXT: [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]] +// CHECK6-NEXT: br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]] +// CHECK6: .worker: +// CHECK6-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l58_worker() #[[ATTR3]] +// CHECK6-NEXT: br label [[DOTEXIT:%.*]] 
+// CHECK6: .mastercheck: +// CHECK6-NEXT: [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK6-NEXT: [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK6-NEXT: [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK6-NEXT: [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1 +// CHECK6-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1 +// CHECK6-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], -1 +// CHECK6-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP2]], [[TMP3]] +// CHECK6-NEXT: [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]] +// CHECK6-NEXT: br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]] +// CHECK6: .master: +// CHECK6-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK6-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK6-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]] +// CHECK6-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1) +// CHECK6-NEXT: call void @__kmpc_data_sharing_init_stack() +// CHECK6-NEXT: [[TMP5:%.*]] = call i8* @__kmpc_data_sharing_push_stack(i32 4, i16 1) +// CHECK6-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty* +// CHECK6-NEXT: [[TMP7:%.*]] = load i32, i32* [[A_ADDR]], align 4 +// CHECK6-NEXT: [[A7:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP6]], i32 0, i32 0 +// CHECK6-NEXT: store i32 [[TMP7]], i32* [[A7]], align 4 +// CHECK6-NEXT: [[TMP8:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) +// CHECK6-NEXT: [[TMP9:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 +// CHECK6-NEXT: [[TMP10:%.*]] = bitcast i32* [[A7]] to i8* +// CHECK6-NEXT: store i8* [[TMP10]], i8** [[TMP9]], align 4 +// CHECK6-NEXT: [[TMP11:%.*]] = bitcast [1 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK6-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP8]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32*)* @__omp_outlined__4 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__4_wrapper to i8*), i8** [[TMP11]], i32 1) +// CHECK6-NEXT: [[TMP12:%.*]] = load i32, i32* [[A7]], align 4 +// CHECK6-NEXT: [[INC:%.*]] = add nsw i32 [[TMP12]], 1 +// CHECK6-NEXT: store i32 [[INC]], i32* [[A7]], align 4 +// CHECK6-NEXT: call void @__kmpc_data_sharing_pop_stack(i8* [[TMP5]]) +// CHECK6-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]] +// CHECK6: .termination.notifier: +// CHECK6-NEXT: call void @__kmpc_kernel_deinit(i16 1) +// CHECK6-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK6-NEXT: br label [[DOTEXIT]] +// CHECK6: .exit: +// CHECK6-NEXT: ret void +// +// +// CHECK6-LABEL: define {{[^@]+}}@__omp_outlined__4 +// CHECK6-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR1]] { +// CHECK6-NEXT: entry: +// CHECK6-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK6-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK6-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 4 +// CHECK6-NEXT: [[CRITICAL_COUNTER:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK6-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK6-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 4 +// 
CHECK6-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 4 +// CHECK6-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_warp_active_thread_mask() +// CHECK6-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK6-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK6-NEXT: store i32 0, i32* [[CRITICAL_COUNTER]], align 4 +// CHECK6-NEXT: br label [[OMP_CRITICAL_LOOP:%.*]] +// CHECK6: omp.critical.loop: +// CHECK6-NEXT: [[TMP2:%.*]] = load i32, i32* [[CRITICAL_COUNTER]], align 4 +// CHECK6-NEXT: [[TMP3:%.*]] = icmp slt i32 [[TMP2]], [[NVPTX_NUM_THREADS]] +// CHECK6-NEXT: br i1 [[TMP3]], label [[OMP_CRITICAL_TEST:%.*]], label [[OMP_CRITICAL_EXIT:%.*]] +// CHECK6: omp.critical.test: +// CHECK6-NEXT: [[TMP4:%.*]] = load i32, i32* [[CRITICAL_COUNTER]], align 4 +// CHECK6-NEXT: [[TMP5:%.*]] = icmp eq i32 [[NVPTX_TID]], [[TMP4]] +// CHECK6-NEXT: br i1 [[TMP5]], label [[OMP_CRITICAL_BODY:%.*]], label [[OMP_CRITICAL_SYNC:%.*]] +// CHECK6: omp.critical.body: +// CHECK6-NEXT: [[TMP6:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK6-NEXT: [[TMP7:%.*]] = load i32, i32* [[TMP6]], align 4 +// CHECK6-NEXT: call void @__kmpc_critical(%struct.ident_t* @[[GLOB1]], i32 [[TMP7]], [8 x i32]* @"_gomp_critical_user_$var") +// CHECK6-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP0]], align 4 +// CHECK6-NEXT: [[INC:%.*]] = add nsw i32 [[TMP8]], 1 +// CHECK6-NEXT: store i32 [[INC]], i32* [[TMP0]], align 4 +// CHECK6-NEXT: call void @__kmpc_end_critical(%struct.ident_t* @[[GLOB1]], i32 [[TMP7]], [8 x i32]* @"_gomp_critical_user_$var") +// CHECK6-NEXT: br label [[OMP_CRITICAL_SYNC]] +// CHECK6: omp.critical.sync: +// CHECK6-NEXT: call void @__kmpc_syncwarp(i32 [[TMP1]]) +// CHECK6-NEXT: [[TMP9:%.*]] = add nsw i32 [[TMP4]], 1 +// CHECK6-NEXT: store i32 [[TMP9]], i32* [[CRITICAL_COUNTER]], align 4 +// CHECK6-NEXT: br label [[OMP_CRITICAL_LOOP]] +// CHECK6: omp.critical.exit: +// CHECK6-NEXT: ret void +// +// +// CHECK6-LABEL: define {{[^@]+}}@__omp_outlined__4_wrapper +// CHECK6-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { +// CHECK6-NEXT: entry: +// CHECK6-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 +// CHECK6-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 4 +// CHECK6-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK6-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 +// CHECK6-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK6-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) +// CHECK6-NEXT: [[TMP2:%.*]] = load i8**, i8*** [[GLOBAL_ARGS]], align 4 +// CHECK6-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8*, i8** [[TMP2]], i32 0 +// CHECK6-NEXT: [[TMP4:%.*]] = bitcast i8** [[TMP3]] to i32** +// CHECK6-NEXT: [[TMP5:%.*]] = load i32*, i32** [[TMP4]], align 4 +// CHECK6-NEXT: call void @__omp_outlined__4(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]], i32* [[TMP5]]) #[[ATTR3]] +// CHECK6-NEXT: ret void +// diff --git a/clang/test/OpenMP/nvptx_parallel_for_codegen.cpp b/clang/test/OpenMP/nvptx_parallel_for_codegen.cpp --- a/clang/test/OpenMP/nvptx_parallel_for_codegen.cpp +++ b/clang/test/OpenMP/nvptx_parallel_for_codegen.cpp @@ -1,7 +1,8 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature --include-generated-funcs --replace-function-regex "__omp_offloading_[0-9a-z]+_[0-9a-z]+" // Test target codegen - host bc file has to be 
created first. // RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc -// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64 --check-prefix SEQ -// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -disable-llvm-optzns -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64 --check-prefix PAR +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix CHECK1 +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -disable-llvm-optzns -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix CHECK2 // expected-no-diagnostics #ifndef HEADER #define HEADER @@ -31,107 +32,442 @@ return a; } -// SEQ: [[MEM_TY:%.+]] = type { [128 x i8] } -// SEQ-DAG: [[SHARED_GLOBAL_RD:@.+]] = weak addrspace(3) global [[MEM_TY]] undef -// SEQ-DAG: [[KERNEL_PTR:@.+]] = internal addrspace(3) global i8* undef -// SEQ-DAG: [[KERNEL_SIZE:@.+]] = internal unnamed_addr constant i{{64|32}} 4 -// SEQ-DAG: [[KERNEL_SHARED:@.+]] = internal unnamed_addr constant i16 1 - -// CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l13}}_worker() -// CHECK: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) -// CHECK: call i1 @__kmpc_kernel_parallel( -// CHECK: call void @__omp_outlined___wrapper( - -// CHECK: define weak void @__omp_offloading_{{.*}}l13( -// CHECK: call void @__omp_offloading_{{.*}}l13_worker() -// CHECK: call void @__kmpc_kernel_init( -// CHECK: call void @__kmpc_data_sharing_init_stack() -// SEQ: [[IS_SHARED:%.+]] = load i16, i16* [[KERNEL_SHARED]], -// SEQ: [[SIZE:%.+]] = load i{{64|32}}, i{{64|32}}* [[KERNEL_SIZE]], -// SEQ: call void @__kmpc_get_team_static_memory(i16 0, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds ([[MEM_TY]], [[MEM_TY]] addrspace(3)* [[SHARED_GLOBAL_RD]], i32 0, i32 0, i32 0) to i8*), i64 %7, i16 %6, i8** addrspacecast (i8* addrspace(3)* [[KERNEL_PTR]] to i8**)) -// SEQ: [[KERNEL_RD:%.+]] = load i8*, i8* addrspace(3)* [[KERNEL_PTR]], -// SEQ: [[STACK:%.+]] = getelementptr inbounds i8, i8* [[KERNEL_RD]], i{{64|32}} 0 -// PAR: [[STACK:%.+]] = call i8* @__kmpc_data_sharing_push_stack(i{{32|64}} 4, i16 1) -// CHECK: call void @__kmpc_kernel_prepare_parallel( -// CHECK: call void @__kmpc_begin_sharing_variables({{.*}}, i64 2) -// CHECK: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) -// CHECK: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) -// CHECK: call void @__kmpc_end_sharing_variables() -// SEQ: [[IS_SHARED:%.+]] = load i16, i16* [[KERNEL_SHARED]], -// SEQ: call void @__kmpc_restore_team_static_memory(i16 0, i16 [[IS_SHARED]]) -// PAR: call void @__kmpc_data_sharing_pop_stack(i8* [[STACK]]) -// CHECK: call void @__kmpc_kernel_deinit(i16 1) - -// CHECK: define internal void @__omp_outlined__( -// 
CHECK: alloca -// CHECK: alloca -// CHECK: alloca -// CHECK: alloca -// CHECK: [[OMP_IV:%.*]] = alloca i32 -// CHECK: store i32 0, {{.*}} [[OMP_LB:%.+]], -// CHECK: store i32 9, {{.*}} [[OMP_UB:%.+]], -// CHECK: store i32 1, {{.*}} [[OMP_ST:%.+]], -// CHECK: call void @__kmpc_for_static_init_4({{.*}} i32 33, {{.*}} [[OMP_LB]], {{.*}} [[OMP_UB]], {{.*}} [[OMP_ST]], i32 1, i32 1) -// CHECK: br label %[[OMP_DISPATCH_COND:.+]] - -// CHECK: [[OMP_DISPATCH_COND]] -// CHECK: [[OMP_UB_1:%.+]] = load {{.*}} [[OMP_UB]] -// CHECK: [[COMP_1:%.+]] = icmp sgt {{.*}} [[OMP_UB_1]] -// CHECK: br i1 [[COMP_1]], label %[[COND_TRUE:.+]], label %[[COND_FALSE:.+]] - -// CHECK: [[COND_TRUE]] -// CHECK: br label %[[COND_END:.+]] - -// CHECK: [[COND_FALSE]] -// CHECK: [[OMP_UB_2:%.+]] = load {{.*}}* [[OMP_UB]] -// CHECK: br label %[[COND_END]] - -// CHECK: [[COND_END]] -// CHECK: [[COND_RES:%.+]] = phi i32 [ 9, %[[COND_TRUE]] ], [ [[OMP_UB_2]], %[[COND_FALSE]] ] -// CHECK: store i32 [[COND_RES]], i32* [[OMP_UB]] -// CHECK: [[OMP_LB_1:%.+]] = load i32, i32* [[OMP_LB]] -// CHECK: store i32 [[OMP_LB_1]], i32* [[OMP_IV]] -// CHECK: [[OMP_IV_1:%.+]] = load i32, i32* [[OMP_IV]] -// CHECK: [[OMP_UB_3:%.+]] = load i32, i32* [[OMP_UB]] -// CHECK: [[COMP_2:%.+]] = icmp sle i32 [[OMP_IV_1]], [[OMP_UB_3]] -// CHECK: br i1 [[COMP_2]], label %[[DISPATCH_BODY:.+]], label %[[DISPATCH_END:.+]] - -// CHECK: [[DISPATCH_BODY]] -// CHECK: br label %[[OMP_INNER_FOR_COND:.+]] - -// CHECK: [[OMP_INNER_FOR_COND]] -// CHECK: [[OMP_IV_2:%.+]] = load i32, i32* [[OMP_IV]] -// CHECK: [[OMP_UB_4:%.+]] = load i32, i32* [[OMP_UB]] -// CHECK: [[COMP_3:%.+]] = icmp sle i32 [[OMP_IV_2]], [[OMP_UB_4]] -// CHECK: br i1 [[COMP_3]], label %[[OMP_INNER_FOR_BODY:.+]], label %[[OMP_INNER_FOR_END:.+]] - -// CHECK: [[OMP_INNER_FOR_BODY]] -// CHECK: br label %[[OMP_BODY_CONTINUE:.+]] - -// CHECK: [[OMP_BODY_CONTINUE]] -// CHECK: br label %[[OMP_INNER_FOR_INC:.+]] - -// CHECK: [[OMP_INNER_FOR_INC]] -// CHECK: [[OMP_IV_3:%.+]] = load i32, i32* [[OMP_IV]] -// CHECK: [[ADD_1:%.+]] = add nsw i32 [[OMP_IV_3]], 1 -// CHECK: store i32 [[ADD_1]], i32* [[OMP_IV]] -// CHECK: br label %[[OMP_INNER_FOR_COND]] - -// CHECK: [[OMP_INNER_FOR_COND]] -// CHECK: br label %[[OMP_DISPATCH_INC:.+]] - -// CHECK: [[OMP_DISPATCH_INC]] -// CHECK: [[OMP_LB_2:%.+]] = load i32, i32* [[OMP_LB]] -// CHECK: [[OMP_ST_1:%.+]] = load i32, i32* [[OMP_ST]] -// CHECK: [[ADD_2:%.+]] = add nsw i32 [[OMP_LB_2]], [[OMP_ST_1]] -// CHECK: store i32 [[ADD_2]], i32* [[OMP_LB]] -// CHECK: [[OMP_UB_5:%.+]] = load i32, i32* [[OMP_UB]] -// CHECK: [[OMP_ST_2:%.+]] = load i32, i32* [[OMP_ST]] -// CHECK: [[ADD_3:%.+]] = add nsw i32 [[OMP_UB_5]], [[OMP_ST_2]] -// CHECK: store i32 [[ADD_3]], i32* [[OMP_UB]] - -// CHECK: [[DISPATCH_END]] -// CHECK: call void @__kmpc_for_static_fini( -// CHECK: ret void - #endif +// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l14_worker +// CHECK1-SAME: () #[[ATTR0:[0-9]+]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 8 +// CHECK1-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1 +// CHECK1-NEXT: store i8* null, i8** [[WORK_FN]], align 8 +// CHECK1-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1 +// CHECK1-NEXT: br label [[DOTAWAIT_WORK:%.*]] +// CHECK1: .await.work: +// CHECK1-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK1-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]]) +// CHECK1-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8 +// CHECK1-NEXT: store 
i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1 +// CHECK1-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 8 +// CHECK1-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null +// CHECK1-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]] +// CHECK1: .select.workers: +// CHECK1-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1 +// CHECK1-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0 +// CHECK1-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]] +// CHECK1: .execute.parallel: +// CHECK1-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2:[0-9]+]]) +// CHECK1-NEXT: [[TMP5:%.*]] = load i8*, i8** [[WORK_FN]], align 8 +// CHECK1-NEXT: [[WORK_MATCH:%.*]] = icmp eq i8* [[TMP5]], bitcast (void (i16, i32)* @__omp_outlined___wrapper to i8*) +// CHECK1-NEXT: br i1 [[WORK_MATCH]], label [[DOTEXECUTE_FN:%.*]], label [[DOTCHECK_NEXT:%.*]] +// CHECK1: .execute.fn: +// CHECK1-NEXT: call void @__omp_outlined___wrapper(i16 0, i32 [[TMP4]]) #[[ATTR3:[0-9]+]] +// CHECK1-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]] +// CHECK1: .check.next: +// CHECK1-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)* +// CHECK1-NEXT: call void [[TMP6]](i16 0, i32 [[TMP4]]) +// CHECK1-NEXT: br label [[DOTTERMINATE_PARALLEL]] +// CHECK1: .terminate.parallel: +// CHECK1-NEXT: call void @__kmpc_kernel_end_parallel() +// CHECK1-NEXT: br label [[DOTBARRIER_PARALLEL]] +// CHECK1: .barrier.parallel: +// CHECK1-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK1-NEXT: br label [[DOTAWAIT_WORK]] +// CHECK1: .exit: +// CHECK1-NEXT: ret void +// +// +// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l14 +// CHECK1-SAME: (i64 [[N:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR1:[0-9]+]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 8 +// CHECK1-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [2 x i8*], align 8 +// CHECK1-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 +// CHECK1-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 8 +// CHECK1-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* +// CHECK1-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 8 +// CHECK1-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK1-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK1-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK1-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]] +// CHECK1-NEXT: [[TMP1:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]] +// CHECK1-NEXT: br i1 [[TMP1]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]] +// CHECK1: .worker: +// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l14_worker() #[[ATTR3]] +// CHECK1-NEXT: br label [[DOTEXIT:%.*]] +// CHECK1: .mastercheck: +// CHECK1-NEXT: [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK1-NEXT: [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK1-NEXT: [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK1-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1 +// CHECK1-NEXT: [[TMP3:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1 +// CHECK1-NEXT: [[TMP4:%.*]] = xor i32 
[[TMP2]], -1 +// CHECK1-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP3]], [[TMP4]] +// CHECK1-NEXT: [[TMP5:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]] +// CHECK1-NEXT: br i1 [[TMP5]], label [[DOTMASTER:%.*]], label [[DOTEXIT]] +// CHECK1: .master: +// CHECK1-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK1-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK1-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]] +// CHECK1-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1) +// CHECK1-NEXT: call void @__kmpc_data_sharing_init_stack() +// CHECK1-NEXT: [[TMP6:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared", align 2 +// CHECK1-NEXT: [[TMP7:%.*]] = load i64, i64* @"_openmp_static_kernel$size", align 8 +// CHECK1-NEXT: call void @__kmpc_get_team_static_memory(i16 0, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds (%"union._shared_openmp_static_memory_type_$_", %"union._shared_openmp_static_memory_type_$_" addrspace(3)* @"_openmp_shared_static_glob_rd_$_", i32 0, i32 0, i32 0) to i8*), i64 [[TMP7]], i16 [[TMP6]], i8** addrspacecast (i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr" to i8**)) +// CHECK1-NEXT: [[TMP8:%.*]] = load i8*, i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr", align 8 +// CHECK1-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, i8* [[TMP8]], i64 0 +// CHECK1-NEXT: [[TMP10:%.*]] = bitcast i8* [[TMP9]] to %struct._globalized_locals_ty* +// CHECK1-NEXT: [[D:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP10]], i32 0, i32 0 +// CHECK1-NEXT: [[TMP11:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]]) +// CHECK1-NEXT: [[TMP12:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK1-NEXT: store i32 [[TMP12]], i32* [[D]], align 4 +// CHECK1-NEXT: [[TMP13:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0 +// CHECK1-NEXT: [[TMP14:%.*]] = bitcast [10 x i32]* [[TMP0]] to i8* +// CHECK1-NEXT: store i8* [[TMP14]], i8** [[TMP13]], align 8 +// CHECK1-NEXT: [[TMP15:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1 +// CHECK1-NEXT: [[TMP16:%.*]] = bitcast i32* [[D]] to i8* +// CHECK1-NEXT: store i8* [[TMP16]], i8** [[TMP15]], align 8 +// CHECK1-NEXT: [[TMP17:%.*]] = bitcast [2 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK1-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB2]], i32 [[TMP11]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, [10 x i32]*, i32*)* @__omp_outlined__ to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined___wrapper to i8*), i8** [[TMP17]], i64 2) +// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP0]], i64 0, i64 3 +// CHECK1-NEXT: [[TMP18:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP18]], 1 +// CHECK1-NEXT: store i32 [[ADD]], i32* [[ARRAYIDX]], align 4 +// CHECK1-NEXT: [[TMP19:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared", align 2 +// CHECK1-NEXT: call void @__kmpc_restore_team_static_memory(i16 0, i16 [[TMP19]]) +// CHECK1-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]] +// CHECK1: .termination.notifier: +// CHECK1-NEXT: call void @__kmpc_kernel_deinit(i16 1) +// CHECK1-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK1-NEXT: br label [[DOTEXIT]] +// CHECK1: .exit: +// CHECK1-NEXT: ret void +// +// +// CHECK1-LABEL: 
define {{[^@]+}}@__omp_outlined__ +// CHECK1-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]], i32* nonnull align 4 dereferenceable(4) [[D:%.*]]) #[[ATTR1]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK1-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 8 +// CHECK1-NEXT: [[D_ADDR:%.*]] = alloca i32*, align 8 +// CHECK1-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK1-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +// CHECK1-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 8 +// CHECK1-NEXT: store i32* [[D]], i32** [[D_ADDR]], align 8 +// CHECK1-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 8 +// CHECK1-NEXT: [[TMP1:%.*]] = load i32*, i32** [[D_ADDR]], align 8 +// CHECK1-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 +// CHECK1-NEXT: store i32 9, i32* [[DOTOMP_UB]], align 4 +// CHECK1-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK1-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK1-NEXT: [[TMP2:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK1-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP2]], align 4 +// CHECK1-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1:[0-9]+]], i32 [[TMP3]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) +// CHECK1-NEXT: br label [[OMP_DISPATCH_COND:%.*]] +// CHECK1: omp.dispatch.cond: +// CHECK1-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK1-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP4]], 9 +// CHECK1-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK1: cond.true: +// CHECK1-NEXT: br label [[COND_END:%.*]] +// CHECK1: cond.false: +// CHECK1-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK1-NEXT: br label [[COND_END]] +// CHECK1: cond.end: +// CHECK1-NEXT: [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP5]], [[COND_FALSE]] ] +// CHECK1-NEXT: store i32 [[COND]], i32* [[DOTOMP_UB]], align 4 +// CHECK1-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK1-NEXT: store i32 [[TMP6]], i32* [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK1-NEXT: [[CMP1:%.*]] = icmp sle i32 [[TMP7]], [[TMP8]] +// CHECK1-NEXT: br i1 [[CMP1]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]] +// CHECK1: omp.dispatch.body: +// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK1: omp.inner.for.cond: +// CHECK1-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK1-NEXT: [[CMP2:%.*]] = icmp sle i32 [[TMP9]], [[TMP10]] +// CHECK1-NEXT: br i1 [[CMP2]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK1: omp.inner.for.body: +// CHECK1-NEXT: 
[[TMP11:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP11]], 1 +// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK1-NEXT: store i32 [[ADD]], i32* [[I]], align 4 +// CHECK1-NEXT: [[TMP12:%.*]] = load i32, i32* [[TMP1]], align 4 +// CHECK1-NEXT: [[TMP13:%.*]] = load i32, i32* [[I]], align 4 +// CHECK1-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP13]] to i64 +// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP0]], i64 0, i64 [[IDXPROM]] +// CHECK1-NEXT: [[TMP14:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +// CHECK1-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP14]], [[TMP12]] +// CHECK1-NEXT: store i32 [[ADD3]], i32* [[ARRAYIDX]], align 4 +// CHECK1-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK1: omp.body.continue: +// CHECK1-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK1: omp.inner.for.inc: +// CHECK1-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: [[ADD4:%.*]] = add nsw i32 [[TMP15]], 1 +// CHECK1-NEXT: store i32 [[ADD4]], i32* [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK1: omp.inner.for.end: +// CHECK1-NEXT: br label [[OMP_DISPATCH_INC:%.*]] +// CHECK1: omp.dispatch.inc: +// CHECK1-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK1-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK1-NEXT: [[ADD5:%.*]] = add nsw i32 [[TMP16]], [[TMP17]] +// CHECK1-NEXT: store i32 [[ADD5]], i32* [[DOTOMP_LB]], align 4 +// CHECK1-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK1-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK1-NEXT: [[ADD6:%.*]] = add nsw i32 [[TMP18]], [[TMP19]] +// CHECK1-NEXT: store i32 [[ADD6]], i32* [[DOTOMP_UB]], align 4 +// CHECK1-NEXT: br label [[OMP_DISPATCH_COND]] +// CHECK1: omp.dispatch.end: +// CHECK1-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP3]]) +// CHECK1-NEXT: ret void +// +// +// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined___wrapper +// CHECK1-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 +// CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 +// CHECK1-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK1-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 +// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK1-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) +// CHECK1-NEXT: [[TMP2:%.*]] = load i8**, i8*** [[GLOBAL_ARGS]], align 8 +// CHECK1-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8*, i8** [[TMP2]], i64 0 +// CHECK1-NEXT: [[TMP4:%.*]] = bitcast i8** [[TMP3]] to [10 x i32]** +// CHECK1-NEXT: [[TMP5:%.*]] = load [10 x i32]*, [10 x i32]** [[TMP4]], align 8 +// CHECK1-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8*, i8** [[TMP2]], i64 1 +// CHECK1-NEXT: [[TMP7:%.*]] = bitcast i8** [[TMP6]] to i32** +// CHECK1-NEXT: [[TMP8:%.*]] = load i32*, i32** [[TMP7]], align 8 +// CHECK1-NEXT: call void @__omp_outlined__(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]], [10 x i32]* [[TMP5]], i32* [[TMP8]]) #[[ATTR3]] +// CHECK1-NEXT: ret void +// +// +// CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l14_worker +// CHECK2-SAME: () #[[ATTR0:[0-9]+]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[WORK_FN:%.*]] = 
alloca i8*, align 8 +// CHECK2-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1 +// CHECK2-NEXT: store i8* null, i8** [[WORK_FN]], align 8 +// CHECK2-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1 +// CHECK2-NEXT: br label [[DOTAWAIT_WORK:%.*]] +// CHECK2: .await.work: +// CHECK2-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK2-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]]) +// CHECK2-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8 +// CHECK2-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1 +// CHECK2-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 8 +// CHECK2-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null +// CHECK2-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]] +// CHECK2: .select.workers: +// CHECK2-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1 +// CHECK2-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0 +// CHECK2-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]] +// CHECK2: .execute.parallel: +// CHECK2-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2:[0-9]+]]) +// CHECK2-NEXT: [[TMP5:%.*]] = load i8*, i8** [[WORK_FN]], align 8 +// CHECK2-NEXT: [[WORK_MATCH:%.*]] = icmp eq i8* [[TMP5]], bitcast (void (i16, i32)* @__omp_outlined___wrapper to i8*) +// CHECK2-NEXT: br i1 [[WORK_MATCH]], label [[DOTEXECUTE_FN:%.*]], label [[DOTCHECK_NEXT:%.*]] +// CHECK2: .execute.fn: +// CHECK2-NEXT: call void @__omp_outlined___wrapper(i16 0, i32 [[TMP4]]) #[[ATTR3:[0-9]+]] +// CHECK2-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]] +// CHECK2: .check.next: +// CHECK2-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)* +// CHECK2-NEXT: call void [[TMP6]](i16 0, i32 [[TMP4]]) +// CHECK2-NEXT: br label [[DOTTERMINATE_PARALLEL]] +// CHECK2: .terminate.parallel: +// CHECK2-NEXT: call void @__kmpc_kernel_end_parallel() +// CHECK2-NEXT: br label [[DOTBARRIER_PARALLEL]] +// CHECK2: .barrier.parallel: +// CHECK2-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK2-NEXT: br label [[DOTAWAIT_WORK]] +// CHECK2: .exit: +// CHECK2-NEXT: ret void +// +// +// CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l14 +// CHECK2-SAME: (i64 [[N:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR1:[0-9]+]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 +// CHECK2-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 8 +// CHECK2-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [2 x i8*], align 8 +// CHECK2-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 +// CHECK2-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 8 +// CHECK2-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* +// CHECK2-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 8 +// CHECK2-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK2-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK2-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK2-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]] +// CHECK2-NEXT: [[TMP1:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]] +// CHECK2-NEXT: br i1 [[TMP1]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]] +// CHECK2: .worker: +// CHECK2-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l14_worker() 
#[[ATTR3]] +// CHECK2-NEXT: br label [[DOTEXIT:%.*]] +// CHECK2: .mastercheck: +// CHECK2-NEXT: [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK2-NEXT: [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK2-NEXT: [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK2-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1 +// CHECK2-NEXT: [[TMP3:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1 +// CHECK2-NEXT: [[TMP4:%.*]] = xor i32 [[TMP2]], -1 +// CHECK2-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP3]], [[TMP4]] +// CHECK2-NEXT: [[TMP5:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]] +// CHECK2-NEXT: br i1 [[TMP5]], label [[DOTMASTER:%.*]], label [[DOTEXIT]] +// CHECK2: .master: +// CHECK2-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK2-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK2-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]] +// CHECK2-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1) +// CHECK2-NEXT: call void @__kmpc_data_sharing_init_stack() +// CHECK2-NEXT: [[TMP6:%.*]] = call i8* @__kmpc_data_sharing_push_stack(i64 4, i16 1) +// CHECK2-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to %struct._globalized_locals_ty* +// CHECK2-NEXT: [[D:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP7]], i32 0, i32 0 +// CHECK2-NEXT: [[TMP8:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]]) +// CHECK2-NEXT: [[TMP9:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK2-NEXT: store i32 [[TMP9]], i32* [[D]], align 4 +// CHECK2-NEXT: [[TMP10:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0 +// CHECK2-NEXT: [[TMP11:%.*]] = bitcast [10 x i32]* [[TMP0]] to i8* +// CHECK2-NEXT: store i8* [[TMP11]], i8** [[TMP10]], align 8 +// CHECK2-NEXT: [[TMP12:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1 +// CHECK2-NEXT: [[TMP13:%.*]] = bitcast i32* [[D]] to i8* +// CHECK2-NEXT: store i8* [[TMP13]], i8** [[TMP12]], align 8 +// CHECK2-NEXT: [[TMP14:%.*]] = bitcast [2 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK2-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB2]], i32 [[TMP8]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, [10 x i32]*, i32*)* @__omp_outlined__ to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined___wrapper to i8*), i8** [[TMP14]], i64 2) +// CHECK2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP0]], i64 0, i64 3 +// CHECK2-NEXT: [[TMP15:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +// CHECK2-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP15]], 1 +// CHECK2-NEXT: store i32 [[ADD]], i32* [[ARRAYIDX]], align 4 +// CHECK2-NEXT: call void @__kmpc_data_sharing_pop_stack(i8* [[TMP6]]) +// CHECK2-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]] +// CHECK2: .termination.notifier: +// CHECK2-NEXT: call void @__kmpc_kernel_deinit(i16 1) +// CHECK2-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK2-NEXT: br label [[DOTEXIT]] +// CHECK2: .exit: +// CHECK2-NEXT: ret void +// +// +// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__ +// CHECK2-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]], i32* nonnull align 4 dereferenceable(4) [[D:%.*]]) #[[ATTR1]] { +// CHECK2-NEXT: entry: +// 
CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK2-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 8 +// CHECK2-NEXT: [[D_ADDR:%.*]] = alloca i32*, align 8 +// CHECK2-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK2-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +// CHECK2-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 8 +// CHECK2-NEXT: store i32* [[D]], i32** [[D_ADDR]], align 8 +// CHECK2-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 8 +// CHECK2-NEXT: [[TMP1:%.*]] = load i32*, i32** [[D_ADDR]], align 8 +// CHECK2-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 +// CHECK2-NEXT: store i32 9, i32* [[DOTOMP_UB]], align 4 +// CHECK2-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK2-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK2-NEXT: [[TMP2:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK2-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP2]], align 4 +// CHECK2-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1:[0-9]+]], i32 [[TMP3]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) +// CHECK2-NEXT: br label [[OMP_DISPATCH_COND:%.*]] +// CHECK2: omp.dispatch.cond: +// CHECK2-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK2-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP4]], 9 +// CHECK2-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK2: cond.true: +// CHECK2-NEXT: br label [[COND_END:%.*]] +// CHECK2: cond.false: +// CHECK2-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK2-NEXT: br label [[COND_END]] +// CHECK2: cond.end: +// CHECK2-NEXT: [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP5]], [[COND_FALSE]] ] +// CHECK2-NEXT: store i32 [[COND]], i32* [[DOTOMP_UB]], align 4 +// CHECK2-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK2-NEXT: store i32 [[TMP6]], i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK2-NEXT: [[CMP1:%.*]] = icmp sle i32 [[TMP7]], [[TMP8]] +// CHECK2-NEXT: br i1 [[CMP1]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]] +// CHECK2: omp.dispatch.body: +// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK2: omp.inner.for.cond: +// CHECK2-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK2-NEXT: [[CMP2:%.*]] = icmp sle i32 [[TMP9]], [[TMP10]] +// CHECK2-NEXT: br i1 [[CMP2]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK2: omp.inner.for.body: +// CHECK2-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP11]], 1 +// CHECK2-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK2-NEXT: store i32 [[ADD]], i32* [[I]], align 4 +// CHECK2-NEXT: [[TMP12:%.*]] = load i32, i32* 
[[TMP1]], align 4 +// CHECK2-NEXT: [[TMP13:%.*]] = load i32, i32* [[I]], align 4 +// CHECK2-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP13]] to i64 +// CHECK2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP0]], i64 0, i64 [[IDXPROM]] +// CHECK2-NEXT: [[TMP14:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +// CHECK2-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP14]], [[TMP12]] +// CHECK2-NEXT: store i32 [[ADD3]], i32* [[ARRAYIDX]], align 4 +// CHECK2-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK2: omp.body.continue: +// CHECK2-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK2: omp.inner.for.inc: +// CHECK2-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: [[ADD4:%.*]] = add nsw i32 [[TMP15]], 1 +// CHECK2-NEXT: store i32 [[ADD4]], i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK2: omp.inner.for.end: +// CHECK2-NEXT: br label [[OMP_DISPATCH_INC:%.*]] +// CHECK2: omp.dispatch.inc: +// CHECK2-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK2-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK2-NEXT: [[ADD5:%.*]] = add nsw i32 [[TMP16]], [[TMP17]] +// CHECK2-NEXT: store i32 [[ADD5]], i32* [[DOTOMP_LB]], align 4 +// CHECK2-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK2-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK2-NEXT: [[ADD6:%.*]] = add nsw i32 [[TMP18]], [[TMP19]] +// CHECK2-NEXT: store i32 [[ADD6]], i32* [[DOTOMP_UB]], align 4 +// CHECK2-NEXT: br label [[OMP_DISPATCH_COND]] +// CHECK2: omp.dispatch.end: +// CHECK2-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP3]]) +// CHECK2-NEXT: ret void +// +// +// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined___wrapper +// CHECK2-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 +// CHECK2-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 +// CHECK2-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK2-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 +// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK2-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) +// CHECK2-NEXT: [[TMP2:%.*]] = load i8**, i8*** [[GLOBAL_ARGS]], align 8 +// CHECK2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8*, i8** [[TMP2]], i64 0 +// CHECK2-NEXT: [[TMP4:%.*]] = bitcast i8** [[TMP3]] to [10 x i32]** +// CHECK2-NEXT: [[TMP5:%.*]] = load [10 x i32]*, [10 x i32]** [[TMP4]], align 8 +// CHECK2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8*, i8** [[TMP2]], i64 1 +// CHECK2-NEXT: [[TMP7:%.*]] = bitcast i8** [[TMP6]] to i32** +// CHECK2-NEXT: [[TMP8:%.*]] = load i32*, i32** [[TMP7]], align 8 +// CHECK2-NEXT: call void @__omp_outlined__(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]], [10 x i32]* [[TMP5]], i32* [[TMP8]]) #[[ATTR3]] +// CHECK2-NEXT: ret void +// diff --git a/clang/test/OpenMP/nvptx_target_codegen.cpp b/clang/test/OpenMP/nvptx_target_codegen.cpp --- a/clang/test/OpenMP/nvptx_target_codegen.cpp +++ b/clang/test/OpenMP/nvptx_target_codegen.cpp @@ -1,25 +1,15 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature --include-generated-funcs --replace-function-regex "__omp_offloading_[0-9a-z]+_[0-9a-z]+" // Test target codegen - host bc file has to be created first. 
// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc -// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64 +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix CHECK1 // RUN: %clang_cc1 -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm-bc %s -o %t-x86-host.bc -// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 -// RUN: %clang_cc1 -verify -fopenmp -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix CHECK2 +// RUN: %clang_cc1 -verify -fopenmp -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix CHECK3 // expected-no-diagnostics #ifndef HEADER #define HEADER -// Check that the execution mode of all 7 target regions is set to Generic Mode. 
-// CHECK-DAG: [[NONSPMD:@.+]] = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds -// CHECK-DAG: [[UNKNOWN:@.+]] = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 2, i32 0, i8* getelementptr inbounds -// CHECK-DAG: {{@__omp_offloading_.+l45}}_exec_mode = weak constant i8 0 -// CHECK-DAG: {{@__omp_offloading_.+l123}}_exec_mode = weak constant i8 1 -// CHECK-DAG: {{@__omp_offloading_.+l200}}_exec_mode = weak constant i8 1 -// CHECK-DAG: {{@__omp_offloading_.+l310}}_exec_mode = weak constant i8 1 -// CHECK-DAG: {{@__omp_offloading_.+l347}}_exec_mode = weak constant i8 1 -// CHECK-DAG: {{@__omp_offloading_.+l365}}_exec_mode = weak constant i8 1 -// CHECK-DAG: {{@__omp_offloading_.+l331}}_exec_mode = weak constant i8 1 - __thread int id; int baz(int f, double &a); @@ -31,16 +21,6 @@ tx &operator[](int i) { return X; } }; -// CHECK: define weak void @__omp_offloading_{{.+}}_{{.+}}targetBar{{.+}}_l45(i32* [[PTR1:%.+]], i32** nonnull align {{[0-9]+}} dereferenceable{{.*}} [[PTR2_REF:%.+]]) -// CHECK: store i32* [[PTR1]], i32** [[PTR1_ADDR:%.+]], -// CHECK: store i32** [[PTR2_REF]], i32*** [[PTR2_REF_PTR:%.+]], -// CHECK: [[PTR2_REF:%.+]] = load i32**, i32*** [[PTR2_REF_PTR]], -// CHECK: call void @__kmpc_spmd_kernel_init( -// CHECK: call void @__kmpc_data_sharing_init_stack_spmd() -// CHECK: [[GTID:%.+]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @{{.+}}) -// CHECK: store i32 [[GTID]], i32* [[THREADID:%.+]], -// CHECK: call void @{{.+}}(i32* [[THREADID]], i32* %{{.+}}, i32** [[PTR1_ADDR]], i32** [[PTR2_REF]]) -// CHECK: call void @__kmpc_spmd_kernel_deinit_v2(i16 1) void targetBar(int *Ptr1, int *Ptr2) { #pragma omp target map(Ptr1[:0], Ptr2) #pragma omp parallel num_threads(2) @@ -56,257 +36,20 @@ double cn[5][n]; TT d; -// CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+foo.+l123}}_worker() -// CHECK-DAG: [[OMP_EXEC_STATUS:%.+]] = alloca i8, -// CHECK-DAG: [[OMP_WORK_FN:%.+]] = alloca i8*, -// CHECK: store i8* null, i8** [[OMP_WORK_FN]], -// CHECK: store i8 0, i8* [[OMP_EXEC_STATUS]], -// CHECK: br label {{%?}}[[AWAIT_WORK:.+]] -// -// CHECK: [[AWAIT_WORK]] -// CHECK: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) -// CHECK: [[WORK:%.+]] = load i8*, i8** [[OMP_WORK_FN]], -// CHECK: [[SHOULD_EXIT:%.+]] = icmp eq i8* [[WORK]], null -// CHECK: br i1 [[SHOULD_EXIT]], label {{%?}}[[EXIT:.+]], label {{%?}}[[SEL_WORKERS:.+]] -// -// CHECK: [[SEL_WORKERS]] -// CHECK: [[ST:%.+]] = load i8, i8* [[OMP_EXEC_STATUS]], -// CHECK: [[IS_ACTIVE:%.+]] = icmp ne i8 [[ST]], 0 -// CHECK: br i1 [[IS_ACTIVE]], label {{%?}}[[EXEC_PARALLEL:.+]], label {{%?}}[[BAR_PARALLEL:.+]] -// -// CHECK: [[EXEC_PARALLEL]] -// CHECK: br label {{%?}}[[TERM_PARALLEL:.+]] -// -// CHECK: [[TERM_PARALLEL]] -// CHECK: br label {{%?}}[[BAR_PARALLEL]] -// -// CHECK: [[BAR_PARALLEL]] -// CHECK: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) -// CHECK: br label {{%?}}[[AWAIT_WORK]] -// -// CHECK: [[EXIT]] -// CHECK: ret void - -// CHECK: define {{.*}}void [[T1:@__omp_offloading_.+foo.+l123]]() -// CHECK-DAG: [[TID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK-DAG: [[NTH:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK-DAG: [[WS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK-DAG: [[TH_LIMIT:%.+]] = sub nuw i32 [[NTH]], [[WS]] -// CHECK: [[IS_WORKER:%.+]] = icmp ult i32 [[TID]], [[TH_LIMIT]] -// CHECK: br i1 [[IS_WORKER]], label {{%?}}[[WORKER:.+]], label 
{{%?}}[[CHECK_MASTER:.+]] -// -// CHECK: [[WORKER]] -// CHECK: {{call|invoke}} void [[T1]]_worker() -// CHECK: br label {{%?}}[[EXIT:.+]] -// -// CHECK: [[CHECK_MASTER]] -// CHECK-DAG: [[CMTID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK-DAG: [[CMNTH:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK-DAG: [[CMWS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK: [[IS_MASTER:%.+]] = icmp eq i32 [[CMTID]], -// CHECK: br i1 [[IS_MASTER]], label {{%?}}[[MASTER:.+]], label {{%?}}[[EXIT]] -// -// CHECK: [[MASTER]] -// CHECK-DAG: [[MNTH:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK-DAG: [[MWS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK: [[MTMP1:%.+]] = sub nuw i32 [[MNTH]], [[MWS]] -// CHECK: call void @__kmpc_kernel_init(i32 [[MTMP1]] -// CHECK: br label {{%?}}[[TERMINATE:.+]] -// -// CHECK: [[TERMINATE]] -// CHECK: call void @__kmpc_kernel_deinit( -// CHECK: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) -// CHECK: br label {{%?}}[[EXIT]] -// -// CHECK: [[EXIT]] -// CHECK: ret void #pragma omp target { } -// CHECK-NOT: define {{.*}}void [[T2:@__omp_offloading_.+foo.+]]_worker() #pragma omp target if (0) { } -// CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+foo.+l200}}_worker() -// CHECK-DAG: [[OMP_EXEC_STATUS:%.+]] = alloca i8, -// CHECK-DAG: [[OMP_WORK_FN:%.+]] = alloca i8*, -// CHECK: store i8* null, i8** [[OMP_WORK_FN]], -// CHECK: store i8 0, i8* [[OMP_EXEC_STATUS]], -// CHECK: br label {{%?}}[[AWAIT_WORK:.+]] -// -// CHECK: [[AWAIT_WORK]] -// CHECK: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) -// CHECK: [[WORK:%.+]] = load i8*, i8** [[OMP_WORK_FN]], -// CHECK: [[SHOULD_EXIT:%.+]] = icmp eq i8* [[WORK]], null -// CHECK: br i1 [[SHOULD_EXIT]], label {{%?}}[[EXIT:.+]], label {{%?}}[[SEL_WORKERS:.+]] -// -// CHECK: [[SEL_WORKERS]] -// CHECK: [[ST:%.+]] = load i8, i8* [[OMP_EXEC_STATUS]], -// CHECK: [[IS_ACTIVE:%.+]] = icmp ne i8 [[ST]], 0 -// CHECK: br i1 [[IS_ACTIVE]], label {{%?}}[[EXEC_PARALLEL:.+]], label {{%?}}[[BAR_PARALLEL:.+]] -// -// CHECK: [[EXEC_PARALLEL]] -// CHECK: br label {{%?}}[[TERM_PARALLEL:.+]] -// -// CHECK: [[TERM_PARALLEL]] -// CHECK: br label {{%?}}[[BAR_PARALLEL]] -// -// CHECK: [[BAR_PARALLEL]] -// CHECK: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) -// CHECK: br label {{%?}}[[AWAIT_WORK]] -// -// CHECK: [[EXIT]] -// CHECK: ret void - -// CHECK: define {{.*}}void [[T2:@__omp_offloading_.+foo.+l200]](i[[SZ:32|64]] [[ARG1:%[a-zA-Z_]+]]) -// CHECK: [[AA_ADDR:%.+]] = alloca i[[SZ]], -// CHECK: store i[[SZ]] [[ARG1]], i[[SZ]]* [[AA_ADDR]], -// CHECK: [[AA_CADDR:%.+]] = bitcast i[[SZ]]* [[AA_ADDR]] to i16* -// CHECK-DAG: [[TID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK-DAG: [[NTH:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK-DAG: [[WS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK-DAG: [[TH_LIMIT:%.+]] = sub nuw i32 [[NTH]], [[WS]] -// CHECK: [[IS_WORKER:%.+]] = icmp ult i32 [[TID]], [[TH_LIMIT]] -// CHECK: br i1 [[IS_WORKER]], label {{%?}}[[WORKER:.+]], label {{%?}}[[CHECK_MASTER:.+]] -// -// CHECK: [[WORKER]] -// CHECK: {{call|invoke}} void [[T2]]_worker() -// CHECK: br label {{%?}}[[EXIT:.+]] -// -// CHECK: [[CHECK_MASTER]] -// CHECK-DAG: [[CMTID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK-DAG: [[CMNTH:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK-DAG: [[CMWS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK: [[IS_MASTER:%.+]] = icmp eq 
i32 [[CMTID]], -// CHECK: br i1 [[IS_MASTER]], label {{%?}}[[MASTER:.+]], label {{%?}}[[EXIT]] -// -// CHECK: [[MASTER]] -// CHECK-DAG: [[MNTH:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK-DAG: [[MWS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK: [[MTMP1:%.+]] = sub nuw i32 [[MNTH]], [[MWS]] -// CHECK: call void @__kmpc_kernel_init(i32 [[MTMP1]] -// CHECK: load i16, i16* [[AA_CADDR]], -// CHECK: br label {{%?}}[[TERMINATE:.+]] -// -// CHECK: [[TERMINATE]] -// CHECK: call void @__kmpc_kernel_deinit( -// CHECK: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) -// CHECK: br label {{%?}}[[EXIT]] -// -// CHECK: [[EXIT]] -// CHECK: ret void #pragma omp target if (1) { aa += 1; aa += 2; } -// CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+foo.+l310}}_worker() -// CHECK-DAG: [[OMP_EXEC_STATUS:%.+]] = alloca i8, -// CHECK-DAG: [[OMP_WORK_FN:%.+]] = alloca i8*, -// CHECK: store i8* null, i8** [[OMP_WORK_FN]], -// CHECK: store i8 0, i8* [[OMP_EXEC_STATUS]], -// CHECK: br label {{%?}}[[AWAIT_WORK:.+]] -// -// CHECK: [[AWAIT_WORK]] -// CHECK: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) -// CHECK: [[WORK:%.+]] = load i8*, i8** [[OMP_WORK_FN]], -// CHECK: [[SHOULD_EXIT:%.+]] = icmp eq i8* [[WORK]], null -// CHECK: br i1 [[SHOULD_EXIT]], label {{%?}}[[EXIT:.+]], label {{%?}}[[SEL_WORKERS:.+]] -// -// CHECK: [[SEL_WORKERS]] -// CHECK: [[ST:%.+]] = load i8, i8* [[OMP_EXEC_STATUS]], -// CHECK: [[IS_ACTIVE:%.+]] = icmp ne i8 [[ST]], 0 -// CHECK: br i1 [[IS_ACTIVE]], label {{%?}}[[EXEC_PARALLEL:.+]], label {{%?}}[[BAR_PARALLEL:.+]] -// -// CHECK: [[EXEC_PARALLEL]] -// CHECK: br label {{%?}}[[TERM_PARALLEL:.+]] -// -// CHECK: [[TERM_PARALLEL]] -// CHECK: br label {{%?}}[[BAR_PARALLEL]] -// -// CHECK: [[BAR_PARALLEL]] -// CHECK: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) -// CHECK: br label {{%?}}[[AWAIT_WORK]] -// -// CHECK: [[EXIT]] -// CHECK: ret void - -// CHECK: define {{.*}}void [[T3:@__omp_offloading_.+foo.+l310]](i[[SZ]] -// Create local storage for each capture. 
-// CHECK: [[LOCAL_A:%.+]] = alloca i[[SZ]] -// CHECK: [[LOCAL_B:%.+]] = alloca [10 x float]* -// CHECK: [[LOCAL_VLA1:%.+]] = alloca i[[SZ]] -// CHECK: [[LOCAL_BN:%.+]] = alloca float* -// CHECK: [[LOCAL_C:%.+]] = alloca [5 x [10 x double]]* -// CHECK: [[LOCAL_VLA2:%.+]] = alloca i[[SZ]] -// CHECK: [[LOCAL_VLA3:%.+]] = alloca i[[SZ]] -// CHECK: [[LOCAL_CN:%.+]] = alloca double* -// CHECK: [[LOCAL_D:%.+]] = alloca [[TT:%.+]]* -// CHECK-DAG: store i[[SZ]] [[ARG_A:%.+]], i[[SZ]]* [[LOCAL_A]] -// CHECK-DAG: store [10 x float]* [[ARG_B:%.+]], [10 x float]** [[LOCAL_B]] -// CHECK-DAG: store i[[SZ]] [[ARG_VLA1:%.+]], i[[SZ]]* [[LOCAL_VLA1]] -// CHECK-DAG: store float* [[ARG_BN:%.+]], float** [[LOCAL_BN]] -// CHECK-DAG: store [5 x [10 x double]]* [[ARG_C:%.+]], [5 x [10 x double]]** [[LOCAL_C]] -// CHECK-DAG: store i[[SZ]] [[ARG_VLA2:%.+]], i[[SZ]]* [[LOCAL_VLA2]] -// CHECK-DAG: store i[[SZ]] [[ARG_VLA3:%.+]], i[[SZ]]* [[LOCAL_VLA3]] -// CHECK-DAG: store double* [[ARG_CN:%.+]], double** [[LOCAL_CN]] -// CHECK-DAG: store [[TT]]* [[ARG_D:%.+]], [[TT]]** [[LOCAL_D]] -// -// CHECK-64-DAG: [[REF_A:%.+]] = bitcast i64* [[LOCAL_A]] to i32* -// CHECK-DAG: [[REF_B:%.+]] = load [10 x float]*, [10 x float]** [[LOCAL_B]], -// CHECK-DAG: [[VAL_VLA1:%.+]] = load i[[SZ]], i[[SZ]]* [[LOCAL_VLA1]], -// CHECK-DAG: [[REF_BN:%.+]] = load float*, float** [[LOCAL_BN]], -// CHECK-DAG: [[REF_C:%.+]] = load [5 x [10 x double]]*, [5 x [10 x double]]** [[LOCAL_C]], -// CHECK-DAG: [[VAL_VLA2:%.+]] = load i[[SZ]], i[[SZ]]* [[LOCAL_VLA2]], -// CHECK-DAG: [[VAL_VLA3:%.+]] = load i[[SZ]], i[[SZ]]* [[LOCAL_VLA3]], -// CHECK-DAG: [[REF_CN:%.+]] = load double*, double** [[LOCAL_CN]], -// CHECK-DAG: [[REF_D:%.+]] = load [[TT]]*, [[TT]]** [[LOCAL_D]], -// -// CHECK-DAG: [[TID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK-DAG: [[NTH:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK-DAG: [[WS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK-DAG: [[TH_LIMIT:%.+]] = sub nuw i32 [[NTH]], [[WS]] -// CHECK: [[IS_WORKER:%.+]] = icmp ult i32 [[TID]], [[TH_LIMIT]] -// CHECK: br i1 [[IS_WORKER]], label {{%?}}[[WORKER:.+]], label {{%?}}[[CHECK_MASTER:.+]] -// -// CHECK: [[WORKER]] -// CHECK: {{call|invoke}} void [[T3]]_worker() -// CHECK: br label {{%?}}[[EXIT:.+]] -// -// CHECK: [[CHECK_MASTER]] -// CHECK-DAG: [[CMTID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK-DAG: [[CMNTH:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK-DAG: [[CMWS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK: [[IS_MASTER:%.+]] = icmp eq i32 [[CMTID]], -// CHECK: br i1 [[IS_MASTER]], label {{%?}}[[MASTER:.+]], label {{%?}}[[EXIT]] -// -// CHECK: [[MASTER]] -// CHECK-DAG: [[MNTH:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK-DAG: [[MWS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK: [[MTMP1:%.+]] = sub nuw i32 [[MNTH]], [[MWS]] -// CHECK: call void @__kmpc_kernel_init(i32 [[MTMP1]] -// -// Use captures. 
-// CHECK-64-DAG: load i32, i32* [[REF_A]] -// CHECK-32-DAG: load i32, i32* [[LOCAL_A]] -// CHECK-DAG: getelementptr inbounds [10 x float], [10 x float]* [[REF_B]], i[[SZ]] 0, i[[SZ]] 2 -// CHECK-DAG: getelementptr inbounds float, float* [[REF_BN]], i[[SZ]] 3 -// CHECK-DAG: getelementptr inbounds [5 x [10 x double]], [5 x [10 x double]]* [[REF_C]], i[[SZ]] 0, i[[SZ]] 1 -// CHECK-DAG: getelementptr inbounds double, double* [[REF_CN]], i[[SZ]] %{{.+}} -// CHECK-DAG: getelementptr inbounds [[TT]], [[TT]]* [[REF_D]], i32 0, i32 0 -// -// CHECK: br label {{%?}}[[TERMINATE:.+]] -// -// CHECK: [[TERMINATE]] -// CHECK: call void @__kmpc_kernel_deinit( -// CHECK: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) -// CHECK: br label {{%?}}[[EXIT]] -// -// CHECK: [[EXIT]] -// CHECK: ret void #pragma omp target if (n > 20) { a += 1; @@ -400,331 +143,2491 @@ assert(0); } -// CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+static.+347}}_worker() -// CHECK-DAG: [[OMP_EXEC_STATUS:%.+]] = alloca i8, -// CHECK-DAG: [[OMP_WORK_FN:%.+]] = alloca i8*, -// CHECK: store i8* null, i8** [[OMP_WORK_FN]], -// CHECK: store i8 0, i8* [[OMP_EXEC_STATUS]], -// CHECK: br label {{%?}}[[AWAIT_WORK:.+]] -// -// CHECK: [[AWAIT_WORK]] -// CHECK: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) -// CHECK: [[WORK:%.+]] = load i8*, i8** [[OMP_WORK_FN]], -// CHECK: [[SHOULD_EXIT:%.+]] = icmp eq i8* [[WORK]], null -// CHECK: br i1 [[SHOULD_EXIT]], label {{%?}}[[EXIT:.+]], label {{%?}}[[SEL_WORKERS:.+]] -// -// CHECK: [[SEL_WORKERS]] -// CHECK: [[ST:%.+]] = load i8, i8* [[OMP_EXEC_STATUS]], -// CHECK: [[IS_ACTIVE:%.+]] = icmp ne i8 [[ST]], 0 -// CHECK: br i1 [[IS_ACTIVE]], label {{%?}}[[EXEC_PARALLEL:.+]], label {{%?}}[[BAR_PARALLEL:.+]] -// -// CHECK: [[EXEC_PARALLEL]] -// CHECK: br label {{%?}}[[TERM_PARALLEL:.+]] -// -// CHECK: [[TERM_PARALLEL]] -// CHECK: br label {{%?}}[[BAR_PARALLEL]] -// -// CHECK: [[BAR_PARALLEL]] -// CHECK: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) -// CHECK: br label {{%?}}[[AWAIT_WORK]] -// -// CHECK: [[EXIT]] -// CHECK: ret void - -// CHECK: define {{.*}}void [[T4:@__omp_offloading_.+static.+l347]](i[[SZ]] -// Create local storage for each capture. -// CHECK: [[LOCAL_A:%.+]] = alloca i[[SZ]] -// CHECK: [[LOCAL_AA:%.+]] = alloca i[[SZ]] -// CHECK: [[LOCAL_AAA:%.+]] = alloca i[[SZ]] -// CHECK: [[LOCAL_B:%.+]] = alloca [10 x i32]* -// CHECK-DAG: store i[[SZ]] [[ARG_A:%.+]], i[[SZ]]* [[LOCAL_A]] -// CHECK-DAG: store i[[SZ]] [[ARG_AA:%.+]], i[[SZ]]* [[LOCAL_AA]] -// CHECK-DAG: store i[[SZ]] [[ARG_AAA:%.+]], i[[SZ]]* [[LOCAL_AAA]] -// CHECK-DAG: store [10 x i32]* [[ARG_B:%.+]], [10 x i32]** [[LOCAL_B]] -// Store captures in the context. 
-// CHECK-64-DAG: [[REF_A:%.+]] = bitcast i[[SZ]]* [[LOCAL_A]] to i32* -// CHECK-DAG: [[REF_AA:%.+]] = bitcast i[[SZ]]* [[LOCAL_AA]] to i16* -// CHECK-DAG: [[REF_AAA:%.+]] = bitcast i[[SZ]]* [[LOCAL_AAA]] to i8* -// CHECK-DAG: [[REF_B:%.+]] = load [10 x i32]*, [10 x i32]** [[LOCAL_B]], -// -// CHECK-DAG: [[TID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK-DAG: [[NTH:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK-DAG: [[WS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK-DAG: [[TH_LIMIT:%.+]] = sub nuw i32 [[NTH]], [[WS]] -// CHECK: [[IS_WORKER:%.+]] = icmp ult i32 [[TID]], [[TH_LIMIT]] -// CHECK: br i1 [[IS_WORKER]], label {{%?}}[[WORKER:.+]], label {{%?}}[[CHECK_MASTER:.+]] -// -// CHECK: [[WORKER]] -// CHECK: {{call|invoke}} void [[T4]]_worker() -// CHECK: br label {{%?}}[[EXIT:.+]] -// -// CHECK: [[CHECK_MASTER]] -// CHECK-DAG: [[CMTID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK-DAG: [[CMNTH:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK-DAG: [[CMWS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK: [[IS_MASTER:%.+]] = icmp eq i32 [[CMTID]], -// CHECK: br i1 [[IS_MASTER]], label {{%?}}[[MASTER:.+]], label {{%?}}[[EXIT]] -// -// CHECK: [[MASTER]] -// CHECK-DAG: [[MNTH:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK-DAG: [[MWS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK: [[MTMP1:%.+]] = sub nuw i32 [[MNTH]], [[MWS]] -// CHECK: call void @__kmpc_kernel_init(i32 [[MTMP1]] -// CHECK-64-DAG: load i32, i32* [[REF_A]] -// CHECK-32-DAG: load i32, i32* [[LOCAL_A]] -// CHECK-DAG: load i16, i16* [[REF_AA]] -// CHECK-DAG: getelementptr inbounds [10 x i32], [10 x i32]* [[REF_B]], i[[SZ]] 0, i[[SZ]] 2 -// CHECK: br label {{%?}}[[TERMINATE:.+]] -// -// CHECK: [[TERMINATE]] -// CHECK: call void @__kmpc_kernel_deinit( -// CHECK: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) -// CHECK: br label {{%?}}[[EXIT]] -// -// CHECK: [[EXIT]] -// CHECK: ret void - -// CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+S1.+l365}}_worker() -// CHECK-DAG: [[OMP_EXEC_STATUS:%.+]] = alloca i8, -// CHECK-DAG: [[OMP_WORK_FN:%.+]] = alloca i8*, -// CHECK: store i8* null, i8** [[OMP_WORK_FN]], -// CHECK: store i8 0, i8* [[OMP_EXEC_STATUS]], -// CHECK: br label {{%?}}[[AWAIT_WORK:.+]] -// -// CHECK: [[AWAIT_WORK]] -// CHECK: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) -// CHECK: [[WORK:%.+]] = load i8*, i8** [[OMP_WORK_FN]], -// CHECK: [[SHOULD_EXIT:%.+]] = icmp eq i8* [[WORK]], null -// CHECK: br i1 [[SHOULD_EXIT]], label {{%?}}[[EXIT:.+]], label {{%?}}[[SEL_WORKERS:.+]] -// -// CHECK: [[SEL_WORKERS]] -// CHECK: [[ST:%.+]] = load i8, i8* [[OMP_EXEC_STATUS]], -// CHECK: [[IS_ACTIVE:%.+]] = icmp ne i8 [[ST]], 0 -// CHECK: br i1 [[IS_ACTIVE]], label {{%?}}[[EXEC_PARALLEL:.+]], label {{%?}}[[BAR_PARALLEL:.+]] -// -// CHECK: [[EXEC_PARALLEL]] -// CHECK: [[GTID:%.+]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* [[NONSPMD]] -// CHECK: [[WORK_FN:%.+]] = bitcast i8* [[WORK]] to void (i16, i32)* -// CHECK: call void [[WORK_FN]](i16 0, i32 [[GTID]]) -// CHECK: br label {{%?}}[[TERM_PARALLEL:.+]] -// -// CHECK: [[TERM_PARALLEL]] -// CHECK: br label {{%?}}[[BAR_PARALLEL]] -// -// CHECK: [[BAR_PARALLEL]] -// CHECK: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) -// CHECK: br label {{%?}}[[AWAIT_WORK]] -// -// CHECK: [[EXIT]] -// CHECK: ret void - -// CHECK: define {{.*}}void [[T5:@__omp_offloading_.+S1.+l365]]( -// Create local storage for each 
capture. -// CHECK: [[LOCAL_THIS:%.+]] = alloca [[S1:%struct.*]]* -// CHECK: [[LOCAL_B:%.+]] = alloca i[[SZ]] -// CHECK: [[LOCAL_VLA1:%.+]] = alloca i[[SZ]] -// CHECK: [[LOCAL_VLA2:%.+]] = alloca i[[SZ]] -// CHECK: [[LOCAL_C:%.+]] = alloca i16* -// CHECK-DAG: store [[S1]]* [[ARG_THIS:%.+]], [[S1]]** [[LOCAL_THIS]] -// CHECK-DAG: store i[[SZ]] [[ARG_B:%.+]], i[[SZ]]* [[LOCAL_B]] -// CHECK-DAG: store i[[SZ]] [[ARG_VLA1:%.+]], i[[SZ]]* [[LOCAL_VLA1]] -// CHECK-DAG: store i[[SZ]] [[ARG_VLA2:%.+]], i[[SZ]]* [[LOCAL_VLA2]] -// CHECK-DAG: store i16* [[ARG_C:%.+]], i16** [[LOCAL_C]] -// Store captures in the context. -// CHECK-DAG: [[REF_THIS:%.+]] = load [[S1]]*, [[S1]]** [[LOCAL_THIS]], -// CHECK-64-DAG:[[REF_B:%.+]] = bitcast i[[SZ]]* [[LOCAL_B]] to i32* -// CHECK-DAG: [[VAL_VLA1:%.+]] = load i[[SZ]], i[[SZ]]* [[LOCAL_VLA1]], -// CHECK-DAG: [[VAL_VLA2:%.+]] = load i[[SZ]], i[[SZ]]* [[LOCAL_VLA2]], -// CHECK-DAG: [[REF_C:%.+]] = load i16*, i16** [[LOCAL_C]], -// -// CHECK-DAG: [[TID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK-DAG: [[NTH:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK-DAG: [[WS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK-DAG: [[TH_LIMIT:%.+]] = sub nuw i32 [[NTH]], [[WS]] -// CHECK: [[IS_WORKER:%.+]] = icmp ult i32 [[TID]], [[TH_LIMIT]] -// CHECK: br i1 [[IS_WORKER]], label {{%?}}[[WORKER:.+]], label {{%?}}[[CHECK_MASTER:.+]] -// -// CHECK: [[WORKER]] -// CHECK: {{call|invoke}} void [[T5]]_worker() -// CHECK: br label {{%?}}[[EXIT:.+]] -// -// CHECK: [[CHECK_MASTER]] -// CHECK-DAG: [[CMTID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK-DAG: [[CMNTH:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK-DAG: [[CMWS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK: [[IS_MASTER:%.+]] = icmp eq i32 [[CMTID]], -// CHECK: br i1 [[IS_MASTER]], label {{%?}}[[MASTER:.+]], label {{%?}}[[EXIT]] -// -// CHECK: [[MASTER]] -// CHECK-DAG: [[MNTH:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK-DAG: [[MWS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK: [[MTMP1:%.+]] = sub nuw i32 [[MNTH]], [[MWS]] -// CHECK: call void @__kmpc_kernel_init(i32 [[MTMP1]] -// Use captures. 
-// CHECK-DAG: getelementptr inbounds [[S1]], [[S1]]* [[REF_THIS]], i32 0, i32 0 -// CHECK-64-DAG:load i32, i32* [[REF_B]] -// CHECK-32-DAG:load i32, i32* [[LOCAL_B]] -// CHECK-DAG: getelementptr inbounds i16, i16* [[REF_C]], i[[SZ]] %{{.+}} -// CHECK: call i32 [[BAZ:@.*baz.*]](i32 % -// CHECK: br label {{%?}}[[TERMINATE:.+]] -// -// CHECK: [[TERMINATE]] -// CHECK: call void @__kmpc_kernel_deinit( -// CHECK: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) -// CHECK: br label {{%?}}[[EXIT]] -// -// CHECK: [[EXIT]] -// CHECK: ret void - -// CHECK: define{{ hidden | }}i32 [[BAZ]](i32 [[F:%.*]], double* nonnull align {{[0-9]+}} dereferenceable{{.*}}) -// CHECK: alloca i32, -// CHECK: [[LOCAL_F_PTR:%.+]] = alloca i32, -// CHECK: [[ZERO_ADDR:%.+]] = alloca i32, -// CHECK: [[BND_ZERO_ADDR:%.+]] = alloca i32, -// CHECK: store i32 0, i32* [[BND_ZERO_ADDR]] -// CHECK: store i32 0, i32* [[ZERO_ADDR]] -// CHECK: [[GTID:%.+]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* [[UNKNOWN]] -// CHECK: [[PAR_LEVEL:%.+]] = call i16 @__kmpc_parallel_level(%struct.ident_t* [[UNKNOWN]], i32 [[GTID]]) -// CHECK: [[IS_TTD:%.+]] = icmp eq i16 %1, 0 -// CHECK: [[RES:%.+]] = call i8 @__kmpc_is_spmd_exec_mode() -// CHECK: [[IS_SPMD:%.+]] = icmp ne i8 [[RES]], 0 -// CHECK: br i1 [[IS_SPMD]], label -// CHECK: br label -// CHECK: [[SIZE:%.+]] = select i1 [[IS_TTD]], i{{64|32}} 4, i{{64|32}} 128 -// CHECK: [[PTR:%.+]] = call i8* @__kmpc_data_sharing_coalesced_push_stack(i{{64|32}} [[SIZE]], i16 0) -// CHECK: [[REC_ADDR:%.+]] = bitcast i8* [[PTR]] to [[GLOBAL_ST:%.+]]* -// CHECK: br label -// CHECK: [[ITEMS:%.+]] = phi [[GLOBAL_ST]]* [ null, {{.+}} ], [ [[REC_ADDR]], {{.+}} ] -// CHECK: [[TTD_ITEMS:%.+]] = bitcast [[GLOBAL_ST]]* [[ITEMS]] to [[SEC_GLOBAL_ST:%.+]]* -// CHECK: [[F_PTR_ARR:%.+]] = getelementptr inbounds [[GLOBAL_ST]], [[GLOBAL_ST]]* [[ITEMS]], i32 0, i32 0 -// CHECK: [[TID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK: [[LID:%.+]] = and i32 [[TID]], 31 -// CHECK: [[GLOBAL_F_PTR_PAR:%.+]] = getelementptr inbounds [32 x i32], [32 x i32]* [[F_PTR_ARR]], i32 0, i32 [[LID]] -// CHECK: [[GLOBAL_F_PTR_TTD:%.+]] = getelementptr inbounds [[SEC_GLOBAL_ST]], [[SEC_GLOBAL_ST]]* [[TTD_ITEMS]], i32 0, i32 0 -// CHECK: [[GLOBAL_F_PTR:%.+]] = select i1 [[IS_TTD]], i32* [[GLOBAL_F_PTR_TTD]], i32* [[GLOBAL_F_PTR_PAR]] -// CHECK: [[F_PTR:%.+]] = select i1 [[IS_SPMD]], i32* [[LOCAL_F_PTR]], i32* [[GLOBAL_F_PTR]] -// CHECK: store i32 %{{.+}}, i32* [[F_PTR]], - -// CHECK: [[RES:%.+]] = call i8 @__kmpc_is_spmd_exec_mode() -// CHECK: icmp ne i8 [[RES]], 0 -// CHECK: br i1 - -// CHECK: [[RES:%.+]] = call i16 @__kmpc_parallel_level(%struct.ident_t* [[UNKNOWN]], i32 [[GTID]]) -// CHECK: icmp ne i16 [[RES]], 0 -// CHECK: br i1 - -// CHECK: call void @__kmpc_serialized_parallel(%struct.ident_t* [[UNKNOWN]], i32 [[GTID]]) -// CHECK: call void [[OUTLINED:@.+]](i32* [[ZERO_ADDR]], i32* [[BND_ZERO_ADDR]], i32* [[F_PTR]], double* %{{.+}}) -// CHECK: call void @__kmpc_end_serialized_parallel(%struct.ident_t* [[UNKNOWN]], i32 [[GTID]]) -// CHECK: br label - -// CHECK: call void @__kmpc_kernel_prepare_parallel(i8* bitcast (void (i16, i32)* @{{.+}} to i8*)) -// CHECK: call void @__kmpc_begin_sharing_variables(i8*** [[SHARED_PTR:%.+]], i{{64|32}} 2) -// CHECK: [[SHARED:%.+]] = load i8**, i8*** [[SHARED_PTR]], -// CHECK: [[REF:%.+]] = getelementptr inbounds i8*, i8** [[SHARED]], i{{64|32}} 0 -// CHECK: [[F_REF:%.+]] = bitcast i32* [[F_PTR]] to i8* -// CHECK: store i8* [[F_REF]], i8** [[REF]], -// CHECK: call 
void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) -// CHECK: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) -// CHECK: call void @__kmpc_end_sharing_variables() -// CHECK: br label - -// CHECK: [[RES:%.+]] = load i32, i32* [[F_PTR]], -// CHECK: store i32 [[RES]], i32* [[RET:%.+]], -// CHECK: br i1 [[IS_SPMD]], label -// CHECK: [[BC:%.+]] = bitcast [[GLOBAL_ST]]* [[ITEMS]] to i8* -// CHECK: call void @__kmpc_data_sharing_pop_stack(i8* [[BC]]) -// CHECK: br label -// CHECK: [[RES:%.+]] = load i32, i32* [[RET]], -// CHECK: ret i32 [[RES]] - -// CHECK: define {{.*}}void {{@__omp_offloading_.+unreachable_call.+l399}}() -// CHECK: call void @{{.*}}assert{{.*}}(i32 0) -// CHECK: unreachable -// CHECK: call void @__kmpc_kernel_deinit(i16 1) -// CHECK: ret void - -// CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l331}}_worker() -// CHECK-DAG: [[OMP_EXEC_STATUS:%.+]] = alloca i8, -// CHECK-DAG: [[OMP_WORK_FN:%.+]] = alloca i8*, -// CHECK: store i8* null, i8** [[OMP_WORK_FN]], -// CHECK: store i8 0, i8* [[OMP_EXEC_STATUS]], -// CHECK: br label {{%?}}[[AWAIT_WORK:.+]] -// -// CHECK: [[AWAIT_WORK]] -// CHECK: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) -// CHECK: [[WORK:%.+]] = load i8*, i8** [[OMP_WORK_FN]], -// CHECK: [[SHOULD_EXIT:%.+]] = icmp eq i8* [[WORK]], null -// CHECK: br i1 [[SHOULD_EXIT]], label {{%?}}[[EXIT:.+]], label {{%?}}[[SEL_WORKERS:.+]] -// -// CHECK: [[SEL_WORKERS]] -// CHECK: [[ST:%.+]] = load i8, i8* [[OMP_EXEC_STATUS]], -// CHECK: [[IS_ACTIVE:%.+]] = icmp ne i8 [[ST]], 0 -// CHECK: br i1 [[IS_ACTIVE]], label {{%?}}[[EXEC_PARALLEL:.+]], label {{%?}}[[BAR_PARALLEL:.+]] -// -// CHECK: [[EXEC_PARALLEL]] -// CHECK: br label {{%?}}[[TERM_PARALLEL:.+]] -// -// CHECK: [[TERM_PARALLEL]] -// CHECK: br label {{%?}}[[BAR_PARALLEL]] -// -// CHECK: [[BAR_PARALLEL]] -// CHECK: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) -// CHECK: br label {{%?}}[[AWAIT_WORK]] -// -// CHECK: [[EXIT]] -// CHECK: ret void - -// CHECK: define {{.*}}void [[T6:@__omp_offloading_.+template.+l331]](i[[SZ]] -// Create local storage for each capture. -// CHECK: [[LOCAL_A:%.+]] = alloca i[[SZ]] -// CHECK: [[LOCAL_AA:%.+]] = alloca i[[SZ]] -// CHECK: [[LOCAL_B:%.+]] = alloca [10 x i32]* -// CHECK-DAG: store i[[SZ]] [[ARG_A:%.+]], i[[SZ]]* [[LOCAL_A]] -// CHECK-DAG: store i[[SZ]] [[ARG_AA:%.+]], i[[SZ]]* [[LOCAL_AA]] -// CHECK-DAG: store [10 x i32]* [[ARG_B:%.+]], [10 x i32]** [[LOCAL_B]] -// Store captures in the context. 
-// CHECK-64-DAG:[[REF_A:%.+]] = bitcast i[[SZ]]* [[LOCAL_A]] to i32* -// CHECK-DAG: [[REF_AA:%.+]] = bitcast i[[SZ]]* [[LOCAL_AA]] to i16* -// CHECK-DAG: [[REF_B:%.+]] = load [10 x i32]*, [10 x i32]** [[LOCAL_B]], -// -// CHECK-DAG: [[TID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK-DAG: [[NTH:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK-DAG: [[WS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK-DAG: [[TH_LIMIT:%.+]] = sub nuw i32 [[NTH]], [[WS]] -// CHECK: [[IS_WORKER:%.+]] = icmp ult i32 [[TID]], [[TH_LIMIT]] -// CHECK: br i1 [[IS_WORKER]], label {{%?}}[[WORKER:.+]], label {{%?}}[[CHECK_MASTER:.+]] -// -// CHECK: [[WORKER]] -// CHECK: {{call|invoke}} void [[T6]]_worker() -// CHECK: br label {{%?}}[[EXIT:.+]] -// -// CHECK: [[CHECK_MASTER]] -// CHECK-DAG: [[CMTID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK-DAG: [[CMNTH:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK-DAG: [[CMWS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK: [[IS_MASTER:%.+]] = icmp eq i32 [[CMTID]], -// CHECK: br i1 [[IS_MASTER]], label {{%?}}[[MASTER:.+]], label {{%?}}[[EXIT]] -// -// CHECK: [[MASTER]] -// CHECK-DAG: [[MNTH:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK-DAG: [[MWS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK: [[MTMP1:%.+]] = sub nuw i32 [[MNTH]], [[MWS]] -// CHECK: call void @__kmpc_kernel_init(i32 [[MTMP1]] -// -// CHECK-64-DAG: load i32, i32* [[REF_A]] -// CHECK-32-DAG: load i32, i32* [[LOCAL_A]] -// CHECK-DAG: load i16, i16* [[REF_AA]] -// CHECK-DAG: getelementptr inbounds [10 x i32], [10 x i32]* [[REF_B]], i[[SZ]] 0, i[[SZ]] 2 -// -// CHECK: br label {{%?}}[[TERMINATE:.+]] -// -// CHECK: [[TERMINATE]] -// CHECK: call void @__kmpc_kernel_deinit( -// CHECK: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) -// CHECK: br label {{%?}}[[EXIT]] -// -// CHECK: [[EXIT]] -// CHECK: ret void - #endif +// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9targetBarPiS__l25 +// CHECK1-SAME: (i32* [[PTR1:%.*]], i32** nonnull align 8 dereferenceable(8) [[PTR2:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[PTR1_ADDR:%.*]] = alloca i32*, align 8 +// CHECK1-NEXT: [[PTR2_ADDR:%.*]] = alloca i32**, align 8 +// CHECK1-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [2 x i8*], align 8 +// CHECK1-NEXT: store i32* [[PTR1]], i32** [[PTR1_ADDR]], align 8 +// CHECK1-NEXT: store i32** [[PTR2]], i32*** [[PTR2_ADDR]], align 8 +// CHECK1-NEXT: [[TMP0:%.*]] = load i32**, i32*** [[PTR2_ADDR]], align 8 +// CHECK1-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK1-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 1) +// CHECK1-NEXT: call void @__kmpc_data_sharing_init_stack_spmd() +// CHECK1-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK1: .execute: +// CHECK1-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]]) +// CHECK1-NEXT: [[TMP2:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0 +// CHECK1-NEXT: [[TMP3:%.*]] = bitcast i32** [[PTR1_ADDR]] to i8* +// CHECK1-NEXT: store i8* [[TMP3]], i8** [[TMP2]], align 8 +// CHECK1-NEXT: [[TMP4:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1 +// CHECK1-NEXT: [[TMP5:%.*]] = bitcast i32** [[TMP0]] to i8* +// CHECK1-NEXT: store i8* [[TMP5]], i8** [[TMP4]], align 8 +// CHECK1-NEXT: [[TMP6:%.*]] = bitcast [2 x i8*]* [[CAPTURED_VARS_ADDRS]] to 
i8** +// CHECK1-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32**, i32**)* @__omp_outlined__ to i8*), i8* null, i8** [[TMP6]], i64 2) +// CHECK1-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK1: .omp.deinit: +// CHECK1-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 1) +// CHECK1-NEXT: br label [[DOTEXIT:%.*]] +// CHECK1: .exit: +// CHECK1-NEXT: ret void +// +// +// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__ +// CHECK1-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32** nonnull align 8 dereferenceable(8) [[PTR1:%.*]], i32** nonnull align 8 dereferenceable(8) [[PTR2:%.*]]) #[[ATTR0]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK1-NEXT: [[PTR1_ADDR:%.*]] = alloca i32**, align 8 +// CHECK1-NEXT: [[PTR2_ADDR:%.*]] = alloca i32**, align 8 +// CHECK1-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK1-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +// CHECK1-NEXT: store i32** [[PTR1]], i32*** [[PTR1_ADDR]], align 8 +// CHECK1-NEXT: store i32** [[PTR2]], i32*** [[PTR2_ADDR]], align 8 +// CHECK1-NEXT: [[TMP0:%.*]] = load i32**, i32*** [[PTR1_ADDR]], align 8 +// CHECK1-NEXT: [[TMP1:%.*]] = load i32**, i32*** [[PTR2_ADDR]], align 8 +// CHECK1-NEXT: [[TMP2:%.*]] = load i32*, i32** [[TMP1]], align 8 +// CHECK1-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP2]], align 4 +// CHECK1-NEXT: [[TMP4:%.*]] = load i32*, i32** [[TMP0]], align 8 +// CHECK1-NEXT: store i32 [[TMP3]], i32* [[TMP4]], align 4 +// CHECK1-NEXT: ret void +// +// +// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l39_worker +// CHECK1-SAME: () #[[ATTR3:[0-9]+]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 8 +// CHECK1-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1 +// CHECK1-NEXT: store i8* null, i8** [[WORK_FN]], align 8 +// CHECK1-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1 +// CHECK1-NEXT: br label [[DOTAWAIT_WORK:%.*]] +// CHECK1: .await.work: +// CHECK1-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK1-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]]) +// CHECK1-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8 +// CHECK1-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1 +// CHECK1-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 8 +// CHECK1-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null +// CHECK1-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]] +// CHECK1: .select.workers: +// CHECK1-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1 +// CHECK1-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0 +// CHECK1-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]] +// CHECK1: .execute.parallel: +// CHECK1-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2:[0-9]+]]) +// CHECK1-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)* +// CHECK1-NEXT: call void [[TMP5]](i16 0, i32 [[TMP4]]) +// CHECK1-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]] +// CHECK1: .terminate.parallel: +// CHECK1-NEXT: call void @__kmpc_kernel_end_parallel() +// CHECK1-NEXT: br label [[DOTBARRIER_PARALLEL]] +// CHECK1: .barrier.parallel: +// CHECK1-NEXT: call void 
@__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK1-NEXT: br label [[DOTAWAIT_WORK]] +// CHECK1: .exit: +// CHECK1-NEXT: ret void +// +// +// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l39 +// CHECK1-SAME: () #[[ATTR0]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK1-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK1-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK1-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]] +// CHECK1-NEXT: [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]] +// CHECK1-NEXT: br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]] +// CHECK1: .worker: +// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l39_worker() #[[ATTR2:[0-9]+]] +// CHECK1-NEXT: br label [[DOTEXIT:%.*]] +// CHECK1: .mastercheck: +// CHECK1-NEXT: [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK1-NEXT: [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK1-NEXT: [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK1-NEXT: [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1 +// CHECK1-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1 +// CHECK1-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], -1 +// CHECK1-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP2]], [[TMP3]] +// CHECK1-NEXT: [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]] +// CHECK1-NEXT: br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]] +// CHECK1: .master: +// CHECK1-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK1-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK1-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]] +// CHECK1-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1) +// CHECK1-NEXT: call void @__kmpc_data_sharing_init_stack() +// CHECK1-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]] +// CHECK1: .termination.notifier: +// CHECK1-NEXT: call void @__kmpc_kernel_deinit(i16 1) +// CHECK1-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK1-NEXT: br label [[DOTEXIT]] +// CHECK1: .exit: +// CHECK1-NEXT: ret void +// +// +// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l47_worker +// CHECK1-SAME: () #[[ATTR3]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 8 +// CHECK1-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1 +// CHECK1-NEXT: store i8* null, i8** [[WORK_FN]], align 8 +// CHECK1-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1 +// CHECK1-NEXT: br label [[DOTAWAIT_WORK:%.*]] +// CHECK1: .await.work: +// CHECK1-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK1-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]]) +// CHECK1-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8 +// CHECK1-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1 +// CHECK1-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 8 +// CHECK1-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null +// CHECK1-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]] +// CHECK1: .select.workers: +// CHECK1-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1 +// CHECK1-NEXT: [[IS_ACTIVE:%.*]] = 
icmp ne i8 [[TMP3]], 0 +// CHECK1-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]] +// CHECK1: .execute.parallel: +// CHECK1-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]]) +// CHECK1-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)* +// CHECK1-NEXT: call void [[TMP5]](i16 0, i32 [[TMP4]]) +// CHECK1-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]] +// CHECK1: .terminate.parallel: +// CHECK1-NEXT: call void @__kmpc_kernel_end_parallel() +// CHECK1-NEXT: br label [[DOTBARRIER_PARALLEL]] +// CHECK1: .barrier.parallel: +// CHECK1-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK1-NEXT: br label [[DOTAWAIT_WORK]] +// CHECK1: .exit: +// CHECK1-NEXT: ret void +// +// +// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l47 +// CHECK1-SAME: (i64 [[AA:%.*]]) #[[ATTR0]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[AA_ADDR:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: store i64 [[AA]], i64* [[AA_ADDR]], align 8 +// CHECK1-NEXT: [[CONV:%.*]] = bitcast i64* [[AA_ADDR]] to i16* +// CHECK1-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK1-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK1-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK1-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]] +// CHECK1-NEXT: [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]] +// CHECK1-NEXT: br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]] +// CHECK1: .worker: +// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l47_worker() #[[ATTR2]] +// CHECK1-NEXT: br label [[DOTEXIT:%.*]] +// CHECK1: .mastercheck: +// CHECK1-NEXT: [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK1-NEXT: [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK1-NEXT: [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK1-NEXT: [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1 +// CHECK1-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1 +// CHECK1-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], -1 +// CHECK1-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP2]], [[TMP3]] +// CHECK1-NEXT: [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]] +// CHECK1-NEXT: br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]] +// CHECK1: .master: +// CHECK1-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK1-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK1-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]] +// CHECK1-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1) +// CHECK1-NEXT: call void @__kmpc_data_sharing_init_stack() +// CHECK1-NEXT: [[TMP5:%.*]] = load i16, i16* [[CONV]], align 8 +// CHECK1-NEXT: [[CONV7:%.*]] = sext i16 [[TMP5]] to i32 +// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 [[CONV7]], 1 +// CHECK1-NEXT: [[CONV8:%.*]] = trunc i32 [[ADD]] to i16 +// CHECK1-NEXT: store i16 [[CONV8]], i16* [[CONV]], align 8 +// CHECK1-NEXT: [[TMP6:%.*]] = load i16, i16* [[CONV]], align 8 +// CHECK1-NEXT: [[CONV9:%.*]] = sext i16 [[TMP6]] to i32 +// CHECK1-NEXT: [[ADD10:%.*]] = add nsw i32 [[CONV9]], 2 +// CHECK1-NEXT: [[CONV11:%.*]] = trunc i32 [[ADD10]] to i16 +// CHECK1-NEXT: store i16 [[CONV11]], i16* [[CONV]], align 8 +// 
CHECK1-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]] +// CHECK1: .termination.notifier: +// CHECK1-NEXT: call void @__kmpc_kernel_deinit(i16 1) +// CHECK1-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK1-NEXT: br label [[DOTEXIT]] +// CHECK1: .exit: +// CHECK1-NEXT: ret void +// +// +// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l53_worker +// CHECK1-SAME: () #[[ATTR3]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 8 +// CHECK1-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1 +// CHECK1-NEXT: store i8* null, i8** [[WORK_FN]], align 8 +// CHECK1-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1 +// CHECK1-NEXT: br label [[DOTAWAIT_WORK:%.*]] +// CHECK1: .await.work: +// CHECK1-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK1-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]]) +// CHECK1-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8 +// CHECK1-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1 +// CHECK1-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 8 +// CHECK1-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null +// CHECK1-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]] +// CHECK1: .select.workers: +// CHECK1-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1 +// CHECK1-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0 +// CHECK1-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]] +// CHECK1: .execute.parallel: +// CHECK1-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]]) +// CHECK1-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)* +// CHECK1-NEXT: call void [[TMP5]](i16 0, i32 [[TMP4]]) +// CHECK1-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]] +// CHECK1: .terminate.parallel: +// CHECK1-NEXT: call void @__kmpc_kernel_end_parallel() +// CHECK1-NEXT: br label [[DOTBARRIER_PARALLEL]] +// CHECK1: .barrier.parallel: +// CHECK1-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK1-NEXT: br label [[DOTAWAIT_WORK]] +// CHECK1: .exit: +// CHECK1-NEXT: ret void +// +// +// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l53 +// CHECK1-SAME: (i64 [[A:%.*]], [10 x float]* nonnull align 4 dereferenceable(40) [[B:%.*]], i64 [[VLA:%.*]], float* nonnull align 4 dereferenceable(4) [[BN:%.*]], [5 x [10 x double]]* nonnull align 8 dereferenceable(400) [[C:%.*]], i64 [[VLA1:%.*]], i64 [[VLA3:%.*]], double* nonnull align 8 dereferenceable(8) [[CN:%.*]], %struct.TT* nonnull align 8 dereferenceable(16) [[D:%.*]]) #[[ATTR0]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[A_ADDR:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[B_ADDR:%.*]] = alloca [10 x float]*, align 8 +// CHECK1-NEXT: [[VLA_ADDR:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[BN_ADDR:%.*]] = alloca float*, align 8 +// CHECK1-NEXT: [[C_ADDR:%.*]] = alloca [5 x [10 x double]]*, align 8 +// CHECK1-NEXT: [[VLA_ADDR2:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[VLA_ADDR4:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[CN_ADDR:%.*]] = alloca double*, align 8 +// CHECK1-NEXT: [[D_ADDR:%.*]] = alloca %struct.TT*, align 8 +// CHECK1-NEXT: store i64 [[A]], i64* [[A_ADDR]], align 8 +// CHECK1-NEXT: store [10 x float]* [[B]], [10 x float]** [[B_ADDR]], align 8 +// CHECK1-NEXT: store i64 [[VLA]], i64* [[VLA_ADDR]], align 8 +// CHECK1-NEXT: store float* [[BN]], float** [[BN_ADDR]], align 8 +// 
CHECK1-NEXT: store [5 x [10 x double]]* [[C]], [5 x [10 x double]]** [[C_ADDR]], align 8 +// CHECK1-NEXT: store i64 [[VLA1]], i64* [[VLA_ADDR2]], align 8 +// CHECK1-NEXT: store i64 [[VLA3]], i64* [[VLA_ADDR4]], align 8 +// CHECK1-NEXT: store double* [[CN]], double** [[CN_ADDR]], align 8 +// CHECK1-NEXT: store %struct.TT* [[D]], %struct.TT** [[D_ADDR]], align 8 +// CHECK1-NEXT: [[CONV:%.*]] = bitcast i64* [[A_ADDR]] to i32* +// CHECK1-NEXT: [[TMP0:%.*]] = load [10 x float]*, [10 x float]** [[B_ADDR]], align 8 +// CHECK1-NEXT: [[TMP1:%.*]] = load i64, i64* [[VLA_ADDR]], align 8 +// CHECK1-NEXT: [[TMP2:%.*]] = load float*, float** [[BN_ADDR]], align 8 +// CHECK1-NEXT: [[TMP3:%.*]] = load [5 x [10 x double]]*, [5 x [10 x double]]** [[C_ADDR]], align 8 +// CHECK1-NEXT: [[TMP4:%.*]] = load i64, i64* [[VLA_ADDR2]], align 8 +// CHECK1-NEXT: [[TMP5:%.*]] = load i64, i64* [[VLA_ADDR4]], align 8 +// CHECK1-NEXT: [[TMP6:%.*]] = load double*, double** [[CN_ADDR]], align 8 +// CHECK1-NEXT: [[TMP7:%.*]] = load %struct.TT*, %struct.TT** [[D_ADDR]], align 8 +// CHECK1-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK1-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK1-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK1-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]] +// CHECK1-NEXT: [[TMP8:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]] +// CHECK1-NEXT: br i1 [[TMP8]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]] +// CHECK1: .worker: +// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l53_worker() #[[ATTR2]] +// CHECK1-NEXT: br label [[DOTEXIT:%.*]] +// CHECK1: .mastercheck: +// CHECK1-NEXT: [[NVPTX_TID5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK1-NEXT: [[NVPTX_NUM_THREADS6:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK1-NEXT: [[NVPTX_WARP_SIZE7:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK1-NEXT: [[TMP9:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE7]], 1 +// CHECK1-NEXT: [[TMP10:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS6]], 1 +// CHECK1-NEXT: [[TMP11:%.*]] = xor i32 [[TMP9]], -1 +// CHECK1-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP10]], [[TMP11]] +// CHECK1-NEXT: [[TMP12:%.*]] = icmp eq i32 [[NVPTX_TID5]], [[MASTER_TID]] +// CHECK1-NEXT: br i1 [[TMP12]], label [[DOTMASTER:%.*]], label [[DOTEXIT]] +// CHECK1: .master: +// CHECK1-NEXT: [[NVPTX_NUM_THREADS8:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK1-NEXT: [[NVPTX_WARP_SIZE9:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK1-NEXT: [[THREAD_LIMIT10:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS8]], [[NVPTX_WARP_SIZE9]] +// CHECK1-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT10]], i16 1) +// CHECK1-NEXT: call void @__kmpc_data_sharing_init_stack() +// CHECK1-NEXT: [[TMP13:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP13]], 1 +// CHECK1-NEXT: store i32 [[ADD]], i32* [[CONV]], align 8 +// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x float], [10 x float]* [[TMP0]], i64 0, i64 2 +// CHECK1-NEXT: [[TMP14:%.*]] = load float, float* [[ARRAYIDX]], align 4 +// CHECK1-NEXT: [[CONV11:%.*]] = fpext float [[TMP14]] to double +// CHECK1-NEXT: [[ADD12:%.*]] = fadd double [[CONV11]], 1.000000e+00 +// CHECK1-NEXT: [[CONV13:%.*]] = fptrunc double [[ADD12]] to float +// CHECK1-NEXT: store float [[CONV13]], float* [[ARRAYIDX]], align 4 +// CHECK1-NEXT: [[ARRAYIDX14:%.*]] = 
getelementptr inbounds float, float* [[TMP2]], i64 3 +// CHECK1-NEXT: [[TMP15:%.*]] = load float, float* [[ARRAYIDX14]], align 4 +// CHECK1-NEXT: [[CONV15:%.*]] = fpext float [[TMP15]] to double +// CHECK1-NEXT: [[ADD16:%.*]] = fadd double [[CONV15]], 1.000000e+00 +// CHECK1-NEXT: [[CONV17:%.*]] = fptrunc double [[ADD16]] to float +// CHECK1-NEXT: store float [[CONV17]], float* [[ARRAYIDX14]], align 4 +// CHECK1-NEXT: [[ARRAYIDX18:%.*]] = getelementptr inbounds [5 x [10 x double]], [5 x [10 x double]]* [[TMP3]], i64 0, i64 1 +// CHECK1-NEXT: [[ARRAYIDX19:%.*]] = getelementptr inbounds [10 x double], [10 x double]* [[ARRAYIDX18]], i64 0, i64 2 +// CHECK1-NEXT: [[TMP16:%.*]] = load double, double* [[ARRAYIDX19]], align 8 +// CHECK1-NEXT: [[ADD20:%.*]] = fadd double [[TMP16]], 1.000000e+00 +// CHECK1-NEXT: store double [[ADD20]], double* [[ARRAYIDX19]], align 8 +// CHECK1-NEXT: [[TMP17:%.*]] = mul nsw i64 1, [[TMP5]] +// CHECK1-NEXT: [[ARRAYIDX21:%.*]] = getelementptr inbounds double, double* [[TMP6]], i64 [[TMP17]] +// CHECK1-NEXT: [[ARRAYIDX22:%.*]] = getelementptr inbounds double, double* [[ARRAYIDX21]], i64 3 +// CHECK1-NEXT: [[TMP18:%.*]] = load double, double* [[ARRAYIDX22]], align 8 +// CHECK1-NEXT: [[ADD23:%.*]] = fadd double [[TMP18]], 1.000000e+00 +// CHECK1-NEXT: store double [[ADD23]], double* [[ARRAYIDX22]], align 8 +// CHECK1-NEXT: [[X:%.*]] = getelementptr inbounds [[STRUCT_TT:%.*]], %struct.TT* [[TMP7]], i32 0, i32 0 +// CHECK1-NEXT: [[TMP19:%.*]] = load i64, i64* [[X]], align 8 +// CHECK1-NEXT: [[ADD24:%.*]] = add nsw i64 [[TMP19]], 1 +// CHECK1-NEXT: store i64 [[ADD24]], i64* [[X]], align 8 +// CHECK1-NEXT: [[Y:%.*]] = getelementptr inbounds [[STRUCT_TT]], %struct.TT* [[TMP7]], i32 0, i32 1 +// CHECK1-NEXT: [[TMP20:%.*]] = load i8, i8* [[Y]], align 8 +// CHECK1-NEXT: [[CONV25:%.*]] = sext i8 [[TMP20]] to i32 +// CHECK1-NEXT: [[ADD26:%.*]] = add nsw i32 [[CONV25]], 1 +// CHECK1-NEXT: [[CONV27:%.*]] = trunc i32 [[ADD26]] to i8 +// CHECK1-NEXT: store i8 [[CONV27]], i8* [[Y]], align 8 +// CHECK1-NEXT: [[CALL:%.*]] = call nonnull align 8 dereferenceable(8) i64* @_ZN2TTIxcEixEi(%struct.TT* nonnull dereferenceable(16) [[TMP7]], i32 0) #[[ATTR7:[0-9]+]] +// CHECK1-NEXT: [[TMP21:%.*]] = load i64, i64* [[CALL]], align 8 +// CHECK1-NEXT: [[ADD28:%.*]] = add nsw i64 [[TMP21]], 1 +// CHECK1-NEXT: store i64 [[ADD28]], i64* [[CALL]], align 8 +// CHECK1-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]] +// CHECK1: .termination.notifier: +// CHECK1-NEXT: call void @__kmpc_kernel_deinit(i16 1) +// CHECK1-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK1-NEXT: br label [[DOTEXIT]] +// CHECK1: .exit: +// CHECK1-NEXT: ret void +// +// +// CHECK1-LABEL: define {{[^@]+}}@_ZN2TTIxcEixEi +// CHECK1-SAME: (%struct.TT* nonnull dereferenceable(16) [[THIS:%.*]], i32 [[I:%.*]]) #[[ATTR5:[0-9]+]] comdat align 2 { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[THIS_ADDR:%.*]] = alloca %struct.TT*, align 8 +// CHECK1-NEXT: [[I_ADDR:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: store %struct.TT* [[THIS]], %struct.TT** [[THIS_ADDR]], align 8 +// CHECK1-NEXT: store i32 [[I]], i32* [[I_ADDR]], align 4 +// CHECK1-NEXT: [[THIS1:%.*]] = load %struct.TT*, %struct.TT** [[THIS_ADDR]], align 8 +// CHECK1-NEXT: [[X:%.*]] = getelementptr inbounds [[STRUCT_TT:%.*]], %struct.TT* [[THIS1]], i32 0, i32 0 +// CHECK1-NEXT: ret i64* [[X]] +// +// +// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l90_worker +// CHECK1-SAME: () #[[ATTR3]] { +// CHECK1-NEXT: entry: 
+// CHECK1-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 8
+// CHECK1-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1
+// CHECK1-NEXT: store i8* null, i8** [[WORK_FN]], align 8
+// CHECK1-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1
+// CHECK1-NEXT: br label [[DOTAWAIT_WORK:%.*]]
+// CHECK1: .await.work:
+// CHECK1-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
+// CHECK1-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]])
+// CHECK1-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8
+// CHECK1-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1
+// CHECK1-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 8
+// CHECK1-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null
+// CHECK1-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]]
+// CHECK1: .select.workers:
+// CHECK1-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1
+// CHECK1-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0
+// CHECK1-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]]
+// CHECK1: .execute.parallel:
+// CHECK1-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]])
+// CHECK1-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)*
+// CHECK1-NEXT: call void [[TMP5]](i16 0, i32 [[TMP4]])
+// CHECK1-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]]
+// CHECK1: .terminate.parallel:
+// CHECK1-NEXT: call void @__kmpc_kernel_end_parallel()
+// CHECK1-NEXT: br label [[DOTBARRIER_PARALLEL]]
+// CHECK1: .barrier.parallel:
+// CHECK1-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
+// CHECK1-NEXT: br label [[DOTAWAIT_WORK]]
+// CHECK1: .exit:
+// CHECK1-NEXT: ret void
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l90
+// CHECK1-SAME: (i64 [[A:%.*]], i64 [[AA:%.*]], i64 [[AAA:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[A_ADDR:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[AA_ADDR:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[AAA_ADDR:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 8
+// CHECK1-NEXT: store i64 [[A]], i64* [[A_ADDR]], align 8
+// CHECK1-NEXT: store i64 [[AA]], i64* [[AA_ADDR]], align 8
+// CHECK1-NEXT: store i64 [[AAA]], i64* [[AAA_ADDR]], align 8
+// CHECK1-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 8
+// CHECK1-NEXT: [[CONV:%.*]] = bitcast i64* [[A_ADDR]] to i32*
+// CHECK1-NEXT: [[CONV1:%.*]] = bitcast i64* [[AA_ADDR]] to i16*
+// CHECK1-NEXT: [[CONV2:%.*]] = bitcast i64* [[AAA_ADDR]] to i8*
+// CHECK1-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 8
+// CHECK1-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+// CHECK1-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK1-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+// CHECK1-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]]
+// CHECK1-NEXT: [[TMP1:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]]
+// CHECK1-NEXT: br i1 [[TMP1]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]]
+// CHECK1: .worker:
+// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l90_worker() #[[ATTR2]]
+// CHECK1-NEXT: br label [[DOTEXIT:%.*]]
+// CHECK1: .mastercheck:
+// CHECK1-NEXT: [[NVPTX_TID3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+// CHECK1-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK1-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+// CHECK1-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE5]], 1
+// CHECK1-NEXT: [[TMP3:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], 1
+// CHECK1-NEXT: [[TMP4:%.*]] = xor i32 [[TMP2]], -1
+// CHECK1-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP3]], [[TMP4]]
+// CHECK1-NEXT: [[TMP5:%.*]] = icmp eq i32 [[NVPTX_TID3]], [[MASTER_TID]]
+// CHECK1-NEXT: br i1 [[TMP5]], label [[DOTMASTER:%.*]], label [[DOTEXIT]]
+// CHECK1: .master:
+// CHECK1-NEXT: [[NVPTX_NUM_THREADS6:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK1-NEXT: [[NVPTX_WARP_SIZE7:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+// CHECK1-NEXT: [[THREAD_LIMIT8:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS6]], [[NVPTX_WARP_SIZE7]]
+// CHECK1-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT8]], i16 1)
+// CHECK1-NEXT: call void @__kmpc_data_sharing_init_stack()
+// CHECK1-NEXT: [[TMP6:%.*]] = load i32, i32* [[CONV]], align 8
+// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP6]], 1
+// CHECK1-NEXT: store i32 [[ADD]], i32* [[CONV]], align 8
+// CHECK1-NEXT: [[TMP7:%.*]] = load i16, i16* [[CONV1]], align 8
+// CHECK1-NEXT: [[CONV9:%.*]] = sext i16 [[TMP7]] to i32
+// CHECK1-NEXT: [[ADD10:%.*]] = add nsw i32 [[CONV9]], 1
+// CHECK1-NEXT: [[CONV11:%.*]] = trunc i32 [[ADD10]] to i16
+// CHECK1-NEXT: store i16 [[CONV11]], i16* [[CONV1]], align 8
+// CHECK1-NEXT: [[TMP8:%.*]] = load i8, i8* [[CONV2]], align 8
+// CHECK1-NEXT: [[CONV12:%.*]] = sext i8 [[TMP8]] to i32
+// CHECK1-NEXT: [[ADD13:%.*]] = add nsw i32 [[CONV12]], 1
+// CHECK1-NEXT: [[CONV14:%.*]] = trunc i32 [[ADD13]] to i8
+// CHECK1-NEXT: store i8 [[CONV14]], i8* [[CONV2]], align 8
+// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP0]], i64 0, i64 2
+// CHECK1-NEXT: [[TMP9:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+// CHECK1-NEXT: [[ADD15:%.*]] = add nsw i32 [[TMP9]], 1
+// CHECK1-NEXT: store i32 [[ADD15]], i32* [[ARRAYIDX]], align 4
+// CHECK1-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]]
+// CHECK1: .termination.notifier:
+// CHECK1-NEXT: call void @__kmpc_kernel_deinit(i16 1)
+// CHECK1-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
+// CHECK1-NEXT: br label [[DOTEXIT]]
+// CHECK1: .exit:
+// CHECK1-NEXT: ret void
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2S12r1Ei_l108_worker
+// CHECK1-SAME: () #[[ATTR3]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 8
+// CHECK1-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1
+// CHECK1-NEXT: store i8* null, i8** [[WORK_FN]], align 8
+// CHECK1-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1
+// CHECK1-NEXT: br label [[DOTAWAIT_WORK:%.*]]
+// CHECK1: .await.work:
+// CHECK1-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
+// CHECK1-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]])
+// CHECK1-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8
+// CHECK1-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1
+// CHECK1-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 8
+// CHECK1-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null
+// CHECK1-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]]
+// CHECK1: .select.workers:
+// CHECK1-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1
+// CHECK1-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0
+// CHECK1-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]]
+// CHECK1: .execute.parallel:
+// CHECK1-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]])
+// CHECK1-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)*
+// CHECK1-NEXT: call void [[TMP5]](i16 0, i32 [[TMP4]])
+// CHECK1-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]]
+// CHECK1: .terminate.parallel:
+// CHECK1-NEXT: call void @__kmpc_kernel_end_parallel()
+// CHECK1-NEXT: br label [[DOTBARRIER_PARALLEL]]
+// CHECK1: .barrier.parallel:
+// CHECK1-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
+// CHECK1-NEXT: br label [[DOTAWAIT_WORK]]
+// CHECK1: .exit:
+// CHECK1-NEXT: ret void
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2S12r1Ei_l108
+// CHECK1-SAME: (%struct.S1* [[THIS:%.*]], i64 [[B:%.*]], i64 [[VLA:%.*]], i64 [[VLA1:%.*]], i16* nonnull align 2 dereferenceable(2) [[C:%.*]]) #[[ATTR0]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[THIS_ADDR:%.*]] = alloca %struct.S1*, align 8
+// CHECK1-NEXT: [[B_ADDR:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[VLA_ADDR:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[VLA_ADDR2:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[C_ADDR:%.*]] = alloca i16*, align 8
+// CHECK1-NEXT: store %struct.S1* [[THIS]], %struct.S1** [[THIS_ADDR]], align 8
+// CHECK1-NEXT: store i64 [[B]], i64* [[B_ADDR]], align 8
+// CHECK1-NEXT: store i64 [[VLA]], i64* [[VLA_ADDR]], align 8
+// CHECK1-NEXT: store i64 [[VLA1]], i64* [[VLA_ADDR2]], align 8
+// CHECK1-NEXT: store i16* [[C]], i16** [[C_ADDR]], align 8
+// CHECK1-NEXT: [[TMP0:%.*]] = load %struct.S1*, %struct.S1** [[THIS_ADDR]], align 8
+// CHECK1-NEXT: [[CONV:%.*]] = bitcast i64* [[B_ADDR]] to i32*
+// CHECK1-NEXT: [[TMP1:%.*]] = load i64, i64* [[VLA_ADDR]], align 8
+// CHECK1-NEXT: [[TMP2:%.*]] = load i64, i64* [[VLA_ADDR2]], align 8
+// CHECK1-NEXT: [[TMP3:%.*]] = load i16*, i16** [[C_ADDR]], align 8
+// CHECK1-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+// CHECK1-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK1-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+// CHECK1-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]]
+// CHECK1-NEXT: [[TMP4:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]]
+// CHECK1-NEXT: br i1 [[TMP4]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]]
+// CHECK1: .worker:
+// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2S12r1Ei_l108_worker() #[[ATTR2]]
+// CHECK1-NEXT: br label [[DOTEXIT:%.*]]
+// CHECK1: .mastercheck:
+// CHECK1-NEXT: [[NVPTX_TID3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+// CHECK1-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK1-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+// CHECK1-NEXT: [[TMP5:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE5]], 1
+// CHECK1-NEXT: [[TMP6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], 1
+// CHECK1-NEXT: [[TMP7:%.*]] = xor i32 [[TMP5]], -1
+// CHECK1-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP6]], [[TMP7]]
+// CHECK1-NEXT: [[TMP8:%.*]] = icmp eq i32 [[NVPTX_TID3]], [[MASTER_TID]]
+// CHECK1-NEXT: br i1 [[TMP8]], label [[DOTMASTER:%.*]], label [[DOTEXIT]]
+// CHECK1: .master:
+// CHECK1-NEXT: [[NVPTX_NUM_THREADS6:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK1-NEXT: [[NVPTX_WARP_SIZE7:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+// CHECK1-NEXT: [[THREAD_LIMIT8:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS6]], [[NVPTX_WARP_SIZE7]]
+// CHECK1-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT8]], i16 1)
+// CHECK1-NEXT: call void @__kmpc_data_sharing_init_stack()
+// CHECK1-NEXT: [[TMP9:%.*]] = load i32, i32* [[CONV]], align 8
+// CHECK1-NEXT: [[CONV9:%.*]] = sitofp i32 [[TMP9]] to double
+// CHECK1-NEXT: [[ADD:%.*]] = fadd double [[CONV9]], 1.500000e+00
+// CHECK1-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT_S1:%.*]], %struct.S1* [[TMP0]], i32 0, i32 0
+// CHECK1-NEXT: store double [[ADD]], double* [[A]], align 8
+// CHECK1-NEXT: [[A10:%.*]] = getelementptr inbounds [[STRUCT_S1]], %struct.S1* [[TMP0]], i32 0, i32 0
+// CHECK1-NEXT: [[TMP10:%.*]] = load double, double* [[A10]], align 8
+// CHECK1-NEXT: [[INC:%.*]] = fadd double [[TMP10]], 1.000000e+00
+// CHECK1-NEXT: store double [[INC]], double* [[A10]], align 8
+// CHECK1-NEXT: [[CONV11:%.*]] = fptosi double [[INC]] to i16
+// CHECK1-NEXT: [[TMP11:%.*]] = mul nsw i64 1, [[TMP2]]
+// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i16, i16* [[TMP3]], i64 [[TMP11]]
+// CHECK1-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds i16, i16* [[ARRAYIDX]], i64 1
+// CHECK1-NEXT: store i16 [[CONV11]], i16* [[ARRAYIDX12]], align 2
+// CHECK1-NEXT: [[A13:%.*]] = getelementptr inbounds [[STRUCT_S1]], %struct.S1* [[TMP0]], i32 0, i32 0
+// CHECK1-NEXT: [[TMP12:%.*]] = load double, double* [[A13]], align 8
+// CHECK1-NEXT: [[CONV14:%.*]] = fptosi double [[TMP12]] to i32
+// CHECK1-NEXT: [[A15:%.*]] = getelementptr inbounds [[STRUCT_S1]], %struct.S1* [[TMP0]], i32 0, i32 0
+// CHECK1-NEXT: [[CALL:%.*]] = call i32 @_Z3baziRd(i32 [[CONV14]], double* nonnull align 8 dereferenceable(8) [[A15]]) #[[ATTR7]]
+// CHECK1-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]]
+// CHECK1: .termination.notifier:
+// CHECK1-NEXT: call void @__kmpc_kernel_deinit(i16 1)
+// CHECK1-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
+// CHECK1-NEXT: br label [[DOTEXIT]]
+// CHECK1: .exit:
+// CHECK1-NEXT: ret void
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@_Z3baziRd
+// CHECK1-SAME: (i32 [[F3:%.*]], double* nonnull align 8 dereferenceable(8) [[A:%.*]]) #[[ATTR5]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[RETVAL:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[F2:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[A_ADDR:%.*]] = alloca double*, align 8
+// CHECK1-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [2 x i8*], align 8
+// CHECK1-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3:[0-9]+]])
+// CHECK1-NEXT: [[TMP1:%.*]] = call i16 @__kmpc_parallel_level(%struct.ident_t* @[[GLOB3]], i32 [[TMP0]])
+// CHECK1-NEXT: [[TMP2:%.*]] = icmp eq i16 [[TMP1]], 0
+// CHECK1-NEXT: [[TMP3:%.*]] = call i8 @__kmpc_is_spmd_exec_mode() #[[ATTR2]]
+// CHECK1-NEXT: [[TMP4:%.*]] = icmp ne i8 [[TMP3]], 0
+// CHECK1-NEXT: br i1 [[TMP4]], label [[DOTSPMD:%.*]], label [[DOTNON_SPMD:%.*]]
+// CHECK1: .spmd:
+// CHECK1-NEXT: br label [[DOTEXIT:%.*]]
+// CHECK1: .non-spmd:
+// CHECK1-NEXT: [[TMP5:%.*]] = select i1 [[TMP2]], i64 4, i64 128
+// CHECK1-NEXT: [[TMP6:%.*]] = call i8* @__kmpc_data_sharing_coalesced_push_stack(i64 [[TMP5]], i16 0)
+// CHECK1-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to %struct._globalized_locals_ty*
+// CHECK1-NEXT: br label [[DOTEXIT]]
+// CHECK1: .exit:
+// CHECK1-NEXT: [[_SELECT_STACK:%.*]] = phi %struct._globalized_locals_ty* [ null, [[DOTSPMD]] ], [ [[TMP7]], [[DOTNON_SPMD]] ]
+// CHECK1-NEXT: [[TMP8:%.*]] = bitcast %struct._globalized_locals_ty* [[_SELECT_STACK]] to %struct._globalized_locals_ty.0*
+// CHECK1-NEXT: [[F:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[_SELECT_STACK]], i32 0, i32 0
+// CHECK1-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+// CHECK1-NEXT: [[NVPTX_LANE_ID:%.*]] = and i32 [[NVPTX_TID]], 31
+// CHECK1-NEXT: [[TMP9:%.*]] = getelementptr inbounds [32 x i32], [32 x i32]* [[F]], i32 0, i32 [[NVPTX_LANE_ID]]
+// CHECK1-NEXT: [[F1:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0:%.*]], %struct._globalized_locals_ty.0* [[TMP8]], i32 0, i32 0
+// CHECK1-NEXT: [[TMP10:%.*]] = select i1 [[TMP2]], i32* [[F1]], i32* [[TMP9]]
+// CHECK1-NEXT: [[TMP11:%.*]] = select i1 [[TMP4]], i32* [[F2]], i32* [[TMP10]]
+// CHECK1-NEXT: store i32 [[F3]], i32* [[TMP11]], align 4
+// CHECK1-NEXT: store double* [[A]], double** [[A_ADDR]], align 8
+// CHECK1-NEXT: [[TMP12:%.*]] = load double*, double** [[A_ADDR]], align 8
+// CHECK1-NEXT: [[TMP13:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0
+// CHECK1-NEXT: [[TMP14:%.*]] = bitcast i32* [[TMP11]] to i8*
+// CHECK1-NEXT: store i8* [[TMP14]], i8** [[TMP13]], align 8
+// CHECK1-NEXT: [[TMP15:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1
+// CHECK1-NEXT: [[TMP16:%.*]] = bitcast double* [[TMP12]] to i8*
+// CHECK1-NEXT: store i8* [[TMP16]], i8** [[TMP15]], align 8
+// CHECK1-NEXT: [[TMP17:%.*]] = bitcast [2 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK1-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP0]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32*, double*)* @__omp_outlined__1 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__1_wrapper to i8*), i8** [[TMP17]], i64 2)
+// CHECK1-NEXT: [[TMP18:%.*]] = load i32, i32* [[TMP11]], align 4
+// CHECK1-NEXT: store i32 [[TMP18]], i32* [[RETVAL]], align 4
+// CHECK1-NEXT: br i1 [[TMP4]], label [[DOTEXIT5:%.*]], label [[DOTNON_SPMD4:%.*]]
+// CHECK1: .non-spmd4:
+// CHECK1-NEXT: [[TMP19:%.*]] = bitcast %struct._globalized_locals_ty* [[_SELECT_STACK]] to i8*
+// CHECK1-NEXT: call void @__kmpc_data_sharing_pop_stack(i8* [[TMP19]])
+// CHECK1-NEXT: br label [[DOTEXIT5]]
+// CHECK1: .exit5:
+// CHECK1-NEXT: [[TMP20:%.*]] = load i32, i32* [[RETVAL]], align 4
+// CHECK1-NEXT: ret i32 [[TMP20]]
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__1
+// CHECK1-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[F:%.*]], double* nonnull align 8 dereferenceable(8) [[A:%.*]]) #[[ATTR0]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK1-NEXT: [[F_ADDR:%.*]] = alloca i32*, align 8
+// CHECK1-NEXT: [[A_ADDR:%.*]] = alloca double*, align 8
+// CHECK1-NEXT: [[TMP:%.*]] = alloca double*, align 8
+// CHECK1-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
+// CHECK1-NEXT: store i32* [[F]], i32** [[F_ADDR]], align 8
+// CHECK1-NEXT: store double* [[A]], double** [[A_ADDR]], align 8
+// CHECK1-NEXT: [[TMP0:%.*]] = load i32*, i32** [[F_ADDR]], align 8
+// CHECK1-NEXT: [[TMP1:%.*]] = load double*, double** [[A_ADDR]], align 8
+// CHECK1-NEXT: store double* [[TMP1]], double** [[TMP]], align 8
+// CHECK1-NEXT: [[TMP2:%.*]] = load double*, double** [[TMP]], align 8
+// CHECK1-NEXT: [[TMP3:%.*]] = load double, double* [[TMP2]], align 8
+// CHECK1-NEXT: [[ADD:%.*]] = fadd double 2.000000e+00, [[TMP3]]
+// CHECK1-NEXT: [[CONV:%.*]] = fptosi double [[ADD]] to i32
+// CHECK1-NEXT: store i32 [[CONV]], i32* [[TMP0]], align 4
+// CHECK1-NEXT: ret void
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__1_wrapper
+// CHECK1-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR3]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2
+// CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8
+// CHECK1-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK1-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2
+// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
+// CHECK1-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]])
+// CHECK1-NEXT: [[TMP2:%.*]] = load i8**, i8*** [[GLOBAL_ARGS]], align 8
+// CHECK1-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8*, i8** [[TMP2]], i64 0
+// CHECK1-NEXT: [[TMP4:%.*]] = bitcast i8** [[TMP3]] to i32**
+// CHECK1-NEXT: [[TMP5:%.*]] = load i32*, i32** [[TMP4]], align 8
+// CHECK1-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8*, i8** [[TMP2]], i64 1
+// CHECK1-NEXT: [[TMP7:%.*]] = bitcast i8** [[TMP6]] to double**
+// CHECK1-NEXT: [[TMP8:%.*]] = load double*, double** [[TMP7]], align 8
+// CHECK1-NEXT: call void @__omp_outlined__1(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]], i32* [[TMP5]], double* [[TMP8]]) #[[ATTR2]]
+// CHECK1-NEXT: ret void
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z16unreachable_callv_l142_worker
+// CHECK1-SAME: () #[[ATTR3]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 8
+// CHECK1-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1
+// CHECK1-NEXT: store i8* null, i8** [[WORK_FN]], align 8
+// CHECK1-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1
+// CHECK1-NEXT: br label [[DOTAWAIT_WORK:%.*]]
+// CHECK1: .await.work:
+// CHECK1-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
+// CHECK1-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]])
+// CHECK1-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8
+// CHECK1-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1
+// CHECK1-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 8
+// CHECK1-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null
+// CHECK1-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]]
+// CHECK1: .select.workers:
+// CHECK1-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1
+// CHECK1-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0
+// CHECK1-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]]
+// CHECK1: .execute.parallel:
+// CHECK1-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]])
+// CHECK1-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)*
+// CHECK1-NEXT: call void [[TMP5]](i16 0, i32 [[TMP4]])
+// CHECK1-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]]
+// CHECK1: .terminate.parallel:
+// CHECK1-NEXT: call void @__kmpc_kernel_end_parallel()
+// CHECK1-NEXT: br label [[DOTBARRIER_PARALLEL]]
+// CHECK1: .barrier.parallel:
+// CHECK1-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
+// CHECK1-NEXT: br label [[DOTAWAIT_WORK]]
+// CHECK1: .exit:
+// CHECK1-NEXT: ret void
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z16unreachable_callv_l142
+// CHECK1-SAME: () #[[ATTR0]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+// CHECK1-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK1-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+// CHECK1-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]]
+// CHECK1-NEXT: [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]]
+// CHECK1-NEXT: br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]]
+// CHECK1: .worker:
+// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z16unreachable_callv_l142_worker() #[[ATTR2]]
+// CHECK1-NEXT: br label [[DOTEXIT:%.*]]
+// CHECK1: .mastercheck:
+// CHECK1-NEXT: [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+// CHECK1-NEXT: [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK1-NEXT: [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+// CHECK1-NEXT: [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1
+// CHECK1-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1
+// CHECK1-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], -1
+// CHECK1-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP2]], [[TMP3]]
+// CHECK1-NEXT: [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]]
+// CHECK1-NEXT: br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]]
+// CHECK1: .master:
+// CHECK1-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK1-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+// CHECK1-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]]
+// CHECK1-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1)
+// CHECK1-NEXT: call void @__kmpc_data_sharing_init_stack()
+// CHECK1-NEXT: call void @_Z6asserti(i32 0) #[[ATTR8:[0-9]+]]
+// CHECK1-NEXT: unreachable
+// CHECK1: 5:
+// CHECK1-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]]
+// CHECK1: .termination.notifier:
+// CHECK1-NEXT: call void @__kmpc_kernel_deinit(i16 1)
+// CHECK1-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
+// CHECK1-NEXT: br label [[DOTEXIT]]
+// CHECK1: .exit:
+// CHECK1-NEXT: ret void
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l74_worker
+// CHECK1-SAME: () #[[ATTR3]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 8
+// CHECK1-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1
+// CHECK1-NEXT: store i8* null, i8** [[WORK_FN]], align 8
+// CHECK1-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1
+// CHECK1-NEXT: br label [[DOTAWAIT_WORK:%.*]]
+// CHECK1: .await.work:
+// CHECK1-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
+// CHECK1-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]])
+// CHECK1-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8
+// CHECK1-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1
+// CHECK1-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 8
+// CHECK1-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null
+// CHECK1-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]]
+// CHECK1: .select.workers:
+// CHECK1-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1
+// CHECK1-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0
+// CHECK1-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]]
+// CHECK1: .execute.parallel:
+// CHECK1-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]])
+// CHECK1-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)*
+// CHECK1-NEXT: call void [[TMP5]](i16 0, i32 [[TMP4]])
+// CHECK1-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]]
+// CHECK1: .terminate.parallel:
+// CHECK1-NEXT: call void @__kmpc_kernel_end_parallel()
+// CHECK1-NEXT: br label [[DOTBARRIER_PARALLEL]]
+// CHECK1: .barrier.parallel:
+// CHECK1-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
+// CHECK1-NEXT: br label [[DOTAWAIT_WORK]]
+// CHECK1: .exit:
+// CHECK1-NEXT: ret void
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l74
+// CHECK1-SAME: (i64 [[A:%.*]], i64 [[AA:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[A_ADDR:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[AA_ADDR:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 8
+// CHECK1-NEXT: store i64 [[A]], i64* [[A_ADDR]], align 8
+// CHECK1-NEXT: store i64 [[AA]], i64* [[AA_ADDR]], align 8
+// CHECK1-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 8
+// CHECK1-NEXT: [[CONV:%.*]] = bitcast i64* [[A_ADDR]] to i32*
+// CHECK1-NEXT: [[CONV1:%.*]] = bitcast i64* [[AA_ADDR]] to i16*
+// CHECK1-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 8
+// CHECK1-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+// CHECK1-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK1-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+// CHECK1-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]]
+// CHECK1-NEXT: [[TMP1:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]]
+// CHECK1-NEXT: br i1 [[TMP1]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]]
+// CHECK1: .worker:
+// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l74_worker() #[[ATTR2]]
+// CHECK1-NEXT: br label [[DOTEXIT:%.*]]
+// CHECK1: .mastercheck:
+// CHECK1-NEXT: [[NVPTX_TID2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+// CHECK1-NEXT: [[NVPTX_NUM_THREADS3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK1-NEXT: [[NVPTX_WARP_SIZE4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+// CHECK1-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE4]], 1
+// CHECK1-NEXT: [[TMP3:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS3]], 1
+// CHECK1-NEXT: [[TMP4:%.*]] = xor i32 [[TMP2]], -1
+// CHECK1-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP3]], [[TMP4]]
+// CHECK1-NEXT: [[TMP5:%.*]] = icmp eq i32 [[NVPTX_TID2]], [[MASTER_TID]]
+// CHECK1-NEXT: br i1 [[TMP5]], label [[DOTMASTER:%.*]], label [[DOTEXIT]]
+// CHECK1: .master:
+// CHECK1-NEXT: [[NVPTX_NUM_THREADS5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK1-NEXT: [[NVPTX_WARP_SIZE6:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+// CHECK1-NEXT: [[THREAD_LIMIT7:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS5]], [[NVPTX_WARP_SIZE6]]
+// CHECK1-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT7]], i16 1)
+// CHECK1-NEXT: call void @__kmpc_data_sharing_init_stack()
+// CHECK1-NEXT: [[TMP6:%.*]] = load i32, i32* [[CONV]], align 8
+// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP6]], 1
+// CHECK1-NEXT: store i32 [[ADD]], i32* [[CONV]], align 8
+// CHECK1-NEXT: [[TMP7:%.*]] = load i16, i16* [[CONV1]], align 8
+// CHECK1-NEXT: [[CONV8:%.*]] = sext i16 [[TMP7]] to i32
+// CHECK1-NEXT: [[ADD9:%.*]] = add nsw i32 [[CONV8]], 1
+// CHECK1-NEXT: [[CONV10:%.*]] = trunc i32 [[ADD9]] to i16
+// CHECK1-NEXT: store i16 [[CONV10]], i16* [[CONV1]], align 8
+// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP0]], i64 0, i64 2
+// CHECK1-NEXT: [[TMP8:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+// CHECK1-NEXT: [[ADD11:%.*]] = add nsw i32 [[TMP8]], 1
+// CHECK1-NEXT: store i32 [[ADD11]], i32* [[ARRAYIDX]], align 4
+// CHECK1-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]]
+// CHECK1: .termination.notifier:
+// CHECK1-NEXT: call void @__kmpc_kernel_deinit(i16 1)
+// CHECK1-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
+// CHECK1-NEXT: br label [[DOTEXIT]]
+// CHECK1: .exit:
+// CHECK1-NEXT: ret void
+//
+//
+// CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9targetBarPiS__l25
+// CHECK2-SAME: (i32* [[PTR1:%.*]], i32** nonnull align 4 dereferenceable(4) [[PTR2:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK2-NEXT: entry:
+// CHECK2-NEXT: [[PTR1_ADDR:%.*]] = alloca i32*, align 4
+// CHECK2-NEXT: [[PTR2_ADDR:%.*]] = alloca i32**, align 4
+// CHECK2-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [2 x i8*], align 4
+// CHECK2-NEXT: store i32* [[PTR1]], i32** [[PTR1_ADDR]], align 4
+// CHECK2-NEXT: store i32** [[PTR2]], i32*** [[PTR2_ADDR]], align 4
+// CHECK2-NEXT: [[TMP0:%.*]] = load i32**, i32*** [[PTR2_ADDR]], align 4
+// CHECK2-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK2-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 1)
+// CHECK2-NEXT: call void @__kmpc_data_sharing_init_stack_spmd()
+// CHECK2-NEXT: br label [[DOTEXECUTE:%.*]]
+// CHECK2: .execute:
+// CHECK2-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]])
+// CHECK2-NEXT: [[TMP2:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0
+// CHECK2-NEXT: [[TMP3:%.*]] = bitcast i32** [[PTR1_ADDR]] to i8*
+// CHECK2-NEXT: store i8* [[TMP3]], i8** [[TMP2]], align 4
+// CHECK2-NEXT: [[TMP4:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1
+// CHECK2-NEXT: [[TMP5:%.*]] = bitcast i32** [[TMP0]] to i8*
+// CHECK2-NEXT: store i8* [[TMP5]], i8** [[TMP4]], align 4
+// CHECK2-NEXT: [[TMP6:%.*]] = bitcast [2 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK2-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32**, i32**)* @__omp_outlined__ to i8*), i8* null, i8** [[TMP6]], i32 2)
+// CHECK2-NEXT: br label [[DOTOMP_DEINIT:%.*]]
+// CHECK2: .omp.deinit:
+// CHECK2-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 1)
+// CHECK2-NEXT: br label [[DOTEXIT:%.*]]
+// CHECK2: .exit:
+// CHECK2-NEXT: ret void
+//
+//
+// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__
+// CHECK2-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32** nonnull align 4 dereferenceable(4) [[PTR1:%.*]], i32** nonnull align 4 dereferenceable(4) [[PTR2:%.*]]) #[[ATTR0]] {
+// CHECK2-NEXT: entry:
+// CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK2-NEXT: [[PTR1_ADDR:%.*]] = alloca i32**, align 4
+// CHECK2-NEXT: [[PTR2_ADDR:%.*]] = alloca i32**, align 4
+// CHECK2-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK2-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK2-NEXT: store i32** [[PTR1]], i32*** [[PTR1_ADDR]], align 4
+// CHECK2-NEXT: store i32** [[PTR2]], i32*** [[PTR2_ADDR]], align 4
+// CHECK2-NEXT: [[TMP0:%.*]] = load i32**, i32*** [[PTR1_ADDR]], align 4
+// CHECK2-NEXT: [[TMP1:%.*]] = load i32**, i32*** [[PTR2_ADDR]], align 4
+// CHECK2-NEXT: [[TMP2:%.*]] = load i32*, i32** [[TMP1]], align 4
+// CHECK2-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP2]], align 4
+// CHECK2-NEXT: [[TMP4:%.*]] = load i32*, i32** [[TMP0]], align 4
+// CHECK2-NEXT: store i32 [[TMP3]], i32* [[TMP4]], align 4
+// CHECK2-NEXT: ret void
+//
+//
+// CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l39_worker
+// CHECK2-SAME: () #[[ATTR3:[0-9]+]] {
+// CHECK2-NEXT: entry:
+// CHECK2-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 4
+// CHECK2-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1
+// CHECK2-NEXT: store i8* null, i8** [[WORK_FN]], align 4
+// CHECK2-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1
+// CHECK2-NEXT: br label [[DOTAWAIT_WORK:%.*]]
+// CHECK2: .await.work:
+// CHECK2-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
+// CHECK2-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]])
+// CHECK2-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8
+// CHECK2-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1
+// CHECK2-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 4
+// CHECK2-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null
+// CHECK2-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]]
+// CHECK2: .select.workers:
+// CHECK2-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1
+// CHECK2-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0
+// CHECK2-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]]
+// CHECK2: .execute.parallel:
+// CHECK2-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2:[0-9]+]])
+// CHECK2-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)*
+// CHECK2-NEXT: call void [[TMP5]](i16 0, i32 [[TMP4]])
+// CHECK2-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]]
+// CHECK2: .terminate.parallel:
+// CHECK2-NEXT: call void @__kmpc_kernel_end_parallel()
+// CHECK2-NEXT: br label [[DOTBARRIER_PARALLEL]]
+// CHECK2: .barrier.parallel:
+// CHECK2-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
+// CHECK2-NEXT: br label [[DOTAWAIT_WORK]]
+// CHECK2: .exit:
+// CHECK2-NEXT: ret void
+//
+//
+// CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l39
+// CHECK2-SAME: () #[[ATTR0]] {
+// CHECK2-NEXT: entry:
+// CHECK2-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+// CHECK2-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK2-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+// CHECK2-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]]
+// CHECK2-NEXT: [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]]
+// CHECK2-NEXT: br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]]
+// CHECK2: .worker:
+// CHECK2-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l39_worker() #[[ATTR2:[0-9]+]]
+// CHECK2-NEXT: br label [[DOTEXIT:%.*]]
+// CHECK2: .mastercheck:
+// CHECK2-NEXT: [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+// CHECK2-NEXT: [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK2-NEXT: [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+// CHECK2-NEXT: [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1
+// CHECK2-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1
+// CHECK2-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], -1
+// CHECK2-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP2]], [[TMP3]]
+// CHECK2-NEXT: [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]]
+// CHECK2-NEXT: br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]]
+// CHECK2: .master:
+// CHECK2-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK2-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+// CHECK2-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]]
+// CHECK2-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1)
+// CHECK2-NEXT: call void @__kmpc_data_sharing_init_stack()
+// CHECK2-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]]
+// CHECK2: .termination.notifier:
+// CHECK2-NEXT: call void @__kmpc_kernel_deinit(i16 1)
+// CHECK2-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
+// CHECK2-NEXT: br label [[DOTEXIT]]
+// CHECK2: .exit:
+// CHECK2-NEXT: ret void
+//
+//
+// CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l47_worker
+// CHECK2-SAME: () #[[ATTR3]] {
+// CHECK2-NEXT: entry:
+// CHECK2-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 4
+// CHECK2-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1
+// CHECK2-NEXT: store i8* null, i8** [[WORK_FN]], align 4
+// CHECK2-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1
+// CHECK2-NEXT: br label [[DOTAWAIT_WORK:%.*]]
+// CHECK2: .await.work:
+// CHECK2-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
+// CHECK2-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]])
+// CHECK2-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8
+// CHECK2-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1
+// CHECK2-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 4
+// CHECK2-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null
+// CHECK2-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]]
+// CHECK2: .select.workers:
+// CHECK2-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1
+// CHECK2-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0
+// CHECK2-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]]
+// CHECK2: .execute.parallel:
+// CHECK2-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]])
+// CHECK2-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)*
+// CHECK2-NEXT: call void [[TMP5]](i16 0, i32 [[TMP4]])
+// CHECK2-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]]
+// CHECK2: .terminate.parallel:
+// CHECK2-NEXT: call void @__kmpc_kernel_end_parallel()
+// CHECK2-NEXT: br label [[DOTBARRIER_PARALLEL]]
+// CHECK2: .barrier.parallel:
+// CHECK2-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
+// CHECK2-NEXT: br label [[DOTAWAIT_WORK]]
+// CHECK2: .exit:
+// CHECK2-NEXT: ret void
+//
+//
+// CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l47
+// CHECK2-SAME: (i32 [[AA:%.*]]) #[[ATTR0]] {
+// CHECK2-NEXT: entry:
+// CHECK2-NEXT: [[AA_ADDR:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: store i32 [[AA]], i32* [[AA_ADDR]], align 4
+// CHECK2-NEXT: [[CONV:%.*]] = bitcast i32* [[AA_ADDR]] to i16*
+// CHECK2-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+// CHECK2-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK2-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+// CHECK2-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]]
+// CHECK2-NEXT: [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]]
+// CHECK2-NEXT: br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]]
+// CHECK2: .worker:
+// CHECK2-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l47_worker() #[[ATTR2]]
+// CHECK2-NEXT: br label [[DOTEXIT:%.*]]
+// CHECK2: .mastercheck:
+// CHECK2-NEXT: [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+// CHECK2-NEXT: [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK2-NEXT: [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+// CHECK2-NEXT: [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1
+// CHECK2-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1
+// CHECK2-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], -1
+// CHECK2-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP2]], [[TMP3]]
+// CHECK2-NEXT: [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]]
+// CHECK2-NEXT: br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]]
+// CHECK2: .master:
+// CHECK2-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK2-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+// CHECK2-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]]
+// CHECK2-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1)
+// CHECK2-NEXT: call void @__kmpc_data_sharing_init_stack()
+// CHECK2-NEXT: [[TMP5:%.*]] = load i16, i16* [[CONV]], align 4
+// CHECK2-NEXT: [[CONV7:%.*]] = sext i16 [[TMP5]] to i32
+// CHECK2-NEXT: [[ADD:%.*]] = add nsw i32 [[CONV7]], 1
+// CHECK2-NEXT: [[CONV8:%.*]] = trunc i32 [[ADD]] to i16
+// CHECK2-NEXT: store i16 [[CONV8]], i16* [[CONV]], align 4
+// CHECK2-NEXT: [[TMP6:%.*]] = load i16, i16* [[CONV]], align 4
+// CHECK2-NEXT: [[CONV9:%.*]] = sext i16 [[TMP6]] to i32
+// CHECK2-NEXT: [[ADD10:%.*]] = add nsw i32 [[CONV9]], 2
+// CHECK2-NEXT: [[CONV11:%.*]] = trunc i32 [[ADD10]] to i16
+// CHECK2-NEXT: store i16 [[CONV11]], i16* [[CONV]], align 4
+// CHECK2-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]]
+// CHECK2: .termination.notifier:
+// CHECK2-NEXT: call void @__kmpc_kernel_deinit(i16 1)
+// CHECK2-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
+// CHECK2-NEXT: br label [[DOTEXIT]]
+// CHECK2: .exit:
+// CHECK2-NEXT: ret void
+//
+//
+// CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l53_worker
+// CHECK2-SAME: () #[[ATTR3]] {
+// CHECK2-NEXT: entry:
+// CHECK2-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 4
+// CHECK2-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1
+// CHECK2-NEXT: store i8* null, i8** [[WORK_FN]], align 4
+// CHECK2-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1
+// CHECK2-NEXT: br label [[DOTAWAIT_WORK:%.*]]
+// CHECK2: .await.work:
+// CHECK2-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
+// CHECK2-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]])
+// CHECK2-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8
+// CHECK2-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1
+// CHECK2-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 4
+// CHECK2-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null
+// CHECK2-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]]
+// CHECK2: .select.workers:
+// CHECK2-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1
+// CHECK2-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0
+// CHECK2-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]]
+// CHECK2: .execute.parallel:
+// CHECK2-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]])
+// CHECK2-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)*
+// CHECK2-NEXT: call void [[TMP5]](i16 0, i32 [[TMP4]])
+// CHECK2-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]]
+// CHECK2: .terminate.parallel:
+// CHECK2-NEXT: call void @__kmpc_kernel_end_parallel()
+// CHECK2-NEXT: br label [[DOTBARRIER_PARALLEL]]
+// CHECK2: .barrier.parallel:
+// CHECK2-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
+// CHECK2-NEXT: br label [[DOTAWAIT_WORK]]
+// CHECK2: .exit:
+// CHECK2-NEXT: ret void
+//
+//
+// CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l53
+// CHECK2-SAME: (i32 [[A:%.*]], [10 x float]* nonnull align 4 dereferenceable(40) [[B:%.*]], i32 [[VLA:%.*]], float* nonnull align 4 dereferenceable(4) [[BN:%.*]], [5 x [10 x double]]* nonnull align 8 dereferenceable(400) [[C:%.*]], i32 [[VLA1:%.*]], i32 [[VLA3:%.*]], double* nonnull align 8 dereferenceable(8) [[CN:%.*]], %struct.TT* nonnull align 8 dereferenceable(16) [[D:%.*]]) #[[ATTR0]] {
+// CHECK2-NEXT: entry:
+// CHECK2-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[B_ADDR:%.*]] = alloca [10 x float]*, align 4
+// CHECK2-NEXT: [[VLA_ADDR:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[BN_ADDR:%.*]] = alloca float*, align 4
+// CHECK2-NEXT: [[C_ADDR:%.*]] = alloca [5 x [10 x double]]*, align 4
+// CHECK2-NEXT: [[VLA_ADDR2:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[VLA_ADDR4:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[CN_ADDR:%.*]] = alloca double*, align 4
+// CHECK2-NEXT: [[D_ADDR:%.*]] = alloca %struct.TT*, align 4
+// CHECK2-NEXT: store i32 [[A]], i32* [[A_ADDR]], align 4
+// CHECK2-NEXT: store [10 x float]* [[B]], [10 x float]** [[B_ADDR]], align 4
+// CHECK2-NEXT: store i32 [[VLA]], i32* [[VLA_ADDR]], align 4
+// CHECK2-NEXT: store float* [[BN]], float** [[BN_ADDR]], align 4
+// CHECK2-NEXT: store [5 x [10 x double]]* [[C]], [5 x [10 x double]]** [[C_ADDR]], align 4
+// CHECK2-NEXT: store i32 [[VLA1]], i32* [[VLA_ADDR2]], align 4
+// CHECK2-NEXT: store i32 [[VLA3]], i32* [[VLA_ADDR4]], align 4
+// CHECK2-NEXT: store double* [[CN]], double** [[CN_ADDR]], align 4
+// CHECK2-NEXT: store %struct.TT* [[D]], %struct.TT** [[D_ADDR]], align 4
+// CHECK2-NEXT: [[TMP0:%.*]] = load [10 x float]*, [10 x float]** [[B_ADDR]], align 4
+// CHECK2-NEXT: [[TMP1:%.*]] = load i32, i32* [[VLA_ADDR]], align 4
+// CHECK2-NEXT: [[TMP2:%.*]] = load float*, float** [[BN_ADDR]], align 4
+// CHECK2-NEXT: [[TMP3:%.*]] = load [5 x [10 x double]]*, [5 x [10 x double]]** [[C_ADDR]], align 4
+// CHECK2-NEXT: [[TMP4:%.*]] = load i32, i32* [[VLA_ADDR2]], align 4
+// CHECK2-NEXT: [[TMP5:%.*]] = load i32, i32* [[VLA_ADDR4]], align 4
+// CHECK2-NEXT: [[TMP6:%.*]] = load double*, double** [[CN_ADDR]], align 4
+// CHECK2-NEXT: [[TMP7:%.*]] = load %struct.TT*, %struct.TT** [[D_ADDR]], align 4
+// CHECK2-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+// CHECK2-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK2-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+// CHECK2-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]]
+// CHECK2-NEXT: [[TMP8:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]]
+// CHECK2-NEXT: br i1 [[TMP8]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]]
+// CHECK2: .worker:
+// CHECK2-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l53_worker() #[[ATTR2]]
+// CHECK2-NEXT: br label [[DOTEXIT:%.*]]
+// CHECK2: .mastercheck:
+// CHECK2-NEXT: [[NVPTX_TID5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+// CHECK2-NEXT: [[NVPTX_NUM_THREADS6:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK2-NEXT: [[NVPTX_WARP_SIZE7:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+// CHECK2-NEXT: [[TMP9:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE7]], 1
+// CHECK2-NEXT: [[TMP10:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS6]], 1
+// CHECK2-NEXT: [[TMP11:%.*]] = xor i32 [[TMP9]], -1
+// CHECK2-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP10]], [[TMP11]]
+// CHECK2-NEXT: [[TMP12:%.*]] = icmp eq i32 [[NVPTX_TID5]], [[MASTER_TID]]
+// CHECK2-NEXT: br i1 [[TMP12]], label [[DOTMASTER:%.*]], label [[DOTEXIT]]
+// CHECK2: .master:
+// CHECK2-NEXT: [[NVPTX_NUM_THREADS8:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK2-NEXT: [[NVPTX_WARP_SIZE9:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+// CHECK2-NEXT: [[THREAD_LIMIT10:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS8]], [[NVPTX_WARP_SIZE9]]
+// CHECK2-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT10]], i16 1)
+// CHECK2-NEXT: call void @__kmpc_data_sharing_init_stack()
+// CHECK2-NEXT: [[TMP13:%.*]] = load i32, i32* [[A_ADDR]], align 4
+// CHECK2-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP13]], 1
+// CHECK2-NEXT: store i32 [[ADD]], i32* [[A_ADDR]], align 4
+// CHECK2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x float], [10 x float]* [[TMP0]], i32 0, i32 2
+// CHECK2-NEXT: [[TMP14:%.*]] = load float, float* [[ARRAYIDX]], align 4
+// CHECK2-NEXT: [[CONV:%.*]] = fpext float [[TMP14]] to double
+// CHECK2-NEXT: [[ADD11:%.*]] = fadd double [[CONV]], 1.000000e+00
+// CHECK2-NEXT: [[CONV12:%.*]] = fptrunc double [[ADD11]] to float
+// CHECK2-NEXT: store float [[CONV12]], float* [[ARRAYIDX]], align 4
+// CHECK2-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds float, float* [[TMP2]], i32 3
+// CHECK2-NEXT: [[TMP15:%.*]] = load float, float* [[ARRAYIDX13]], align 4
+// CHECK2-NEXT: [[CONV14:%.*]] = fpext float [[TMP15]] to double
+// CHECK2-NEXT: [[ADD15:%.*]] = fadd double [[CONV14]], 1.000000e+00
+// CHECK2-NEXT: [[CONV16:%.*]] = fptrunc double [[ADD15]] to float
+// CHECK2-NEXT: store float [[CONV16]], float* [[ARRAYIDX13]], align 4
+// CHECK2-NEXT: [[ARRAYIDX17:%.*]] = getelementptr inbounds [5 x [10 x double]], [5 x [10 x double]]* [[TMP3]], i32 0, i32 1
+// CHECK2-NEXT: [[ARRAYIDX18:%.*]] = getelementptr inbounds [10 x double], [10 x double]* [[ARRAYIDX17]], i32 0, i32 2
+// CHECK2-NEXT: [[TMP16:%.*]] = load double, double* [[ARRAYIDX18]], align 8
+// CHECK2-NEXT: [[ADD19:%.*]] = fadd double [[TMP16]], 1.000000e+00
+// CHECK2-NEXT: store double [[ADD19]], double* [[ARRAYIDX18]], align 8
+// CHECK2-NEXT: [[TMP17:%.*]] = mul nsw i32 1, [[TMP5]]
+// CHECK2-NEXT: [[ARRAYIDX20:%.*]] = getelementptr inbounds double, double* [[TMP6]], i32 [[TMP17]]
+// CHECK2-NEXT: [[ARRAYIDX21:%.*]] = getelementptr inbounds double, double* [[ARRAYIDX20]], i32 3
+// CHECK2-NEXT: [[TMP18:%.*]] = load double, double* [[ARRAYIDX21]], align 8
+// CHECK2-NEXT: [[ADD22:%.*]] = fadd double [[TMP18]], 1.000000e+00
+// CHECK2-NEXT: store double [[ADD22]], double* [[ARRAYIDX21]], align 8
+// CHECK2-NEXT: [[X:%.*]] = getelementptr inbounds [[STRUCT_TT:%.*]], %struct.TT* [[TMP7]], i32 0, i32 0
+// CHECK2-NEXT: [[TMP19:%.*]] = load i64, i64* [[X]], align 8
+// CHECK2-NEXT: [[ADD23:%.*]] = add nsw i64 [[TMP19]], 1
+// CHECK2-NEXT: store i64 [[ADD23]], i64* [[X]], align 8
+// CHECK2-NEXT: [[Y:%.*]] = getelementptr inbounds [[STRUCT_TT]], %struct.TT* [[TMP7]], i32 0, i32 1
+// CHECK2-NEXT: [[TMP20:%.*]] = load i8, i8* [[Y]], align 8
+// CHECK2-NEXT: [[CONV24:%.*]] = sext i8 [[TMP20]] to i32
+// CHECK2-NEXT: [[ADD25:%.*]] = add nsw i32 [[CONV24]], 1
+// CHECK2-NEXT: [[CONV26:%.*]] = trunc i32 [[ADD25]] to i8
+// CHECK2-NEXT: store i8 [[CONV26]], i8* [[Y]], align 8
+// CHECK2-NEXT: [[CALL:%.*]] = call nonnull align 8 dereferenceable(8) i64* @_ZN2TTIxcEixEi(%struct.TT* nonnull dereferenceable(16) [[TMP7]], i32 0) #[[ATTR7:[0-9]+]]
+// CHECK2-NEXT: [[TMP21:%.*]] = load i64, i64* [[CALL]], align 8
+// CHECK2-NEXT: [[ADD27:%.*]] = add nsw i64 [[TMP21]], 1
+// CHECK2-NEXT: store i64 [[ADD27]], i64* [[CALL]], align 8
+// CHECK2-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]]
+// CHECK2: .termination.notifier:
+// CHECK2-NEXT: call void @__kmpc_kernel_deinit(i16 1)
+// CHECK2-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
+// CHECK2-NEXT: br label [[DOTEXIT]]
+// CHECK2: .exit:
+// CHECK2-NEXT: ret void
+//
+//
+// CHECK2-LABEL: define {{[^@]+}}@_ZN2TTIxcEixEi
+// CHECK2-SAME: (%struct.TT* nonnull dereferenceable(16) [[THIS:%.*]], i32 [[I:%.*]]) #[[ATTR5:[0-9]+]] comdat align 2 {
+// CHECK2-NEXT: entry:
+// CHECK2-NEXT: [[THIS_ADDR:%.*]] = alloca %struct.TT*, align 4
+// CHECK2-NEXT: [[I_ADDR:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: store %struct.TT* [[THIS]], %struct.TT** [[THIS_ADDR]], align 4
+// CHECK2-NEXT: store i32 [[I]], i32* [[I_ADDR]], align 4
+// CHECK2-NEXT: [[THIS1:%.*]] = load %struct.TT*, %struct.TT** [[THIS_ADDR]], align 4
+// CHECK2-NEXT: [[X:%.*]] = getelementptr inbounds [[STRUCT_TT:%.*]], %struct.TT* [[THIS1]], i32 0, i32 0
+// CHECK2-NEXT: ret i64* [[X]]
+//
+//
+// CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l90_worker
+// CHECK2-SAME: () #[[ATTR3]] {
+// CHECK2-NEXT: entry:
+// CHECK2-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 4
+// CHECK2-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1
+// CHECK2-NEXT: store i8* null, i8** [[WORK_FN]], align 4
+// CHECK2-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1
+// CHECK2-NEXT: br label [[DOTAWAIT_WORK:%.*]]
+// CHECK2: .await.work:
+// CHECK2-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
+// CHECK2-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]])
+// CHECK2-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8
+// CHECK2-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1
+// CHECK2-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 4
+// CHECK2-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null
+// CHECK2-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]]
+// CHECK2: .select.workers:
+// CHECK2-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1
+// CHECK2-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0
+// CHECK2-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]]
+// CHECK2: .execute.parallel:
+// CHECK2-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]])
+// CHECK2-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)*
+// CHECK2-NEXT: call void [[TMP5]](i16 0, i32 [[TMP4]])
+// CHECK2-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]]
+// CHECK2: .terminate.parallel:
+// CHECK2-NEXT: call void @__kmpc_kernel_end_parallel()
+// CHECK2-NEXT: br label [[DOTBARRIER_PARALLEL]]
+// CHECK2: .barrier.parallel:
+// CHECK2-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
+// CHECK2-NEXT: br label [[DOTAWAIT_WORK]]
+// CHECK2: .exit:
+// CHECK2-NEXT: ret void
+//
+//
+// CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l90
+// CHECK2-SAME: (i32 [[A:%.*]], i32 [[AA:%.*]], i32 [[AAA:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] {
+// CHECK2-NEXT: entry:
+// CHECK2-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[AA_ADDR:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[AAA_ADDR:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4
+// CHECK2-NEXT: store i32 [[A]], i32* [[A_ADDR]], align 4
+// CHECK2-NEXT: store i32 [[AA]], i32* [[AA_ADDR]], align 4
+// CHECK2-NEXT: store i32 [[AAA]], i32* [[AAA_ADDR]], align 4
+// CHECK2-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4
+// CHECK2-NEXT: [[CONV:%.*]] = bitcast i32* [[AA_ADDR]] to i16*
+// CHECK2-NEXT: [[CONV1:%.*]] = bitcast i32* [[AAA_ADDR]] to i8*
+// CHECK2-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4
+// CHECK2-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+// CHECK2-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK2-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+// CHECK2-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]]
+// CHECK2-NEXT: [[TMP1:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]]
+// CHECK2-NEXT: br i1 [[TMP1]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]]
+// CHECK2: .worker:
+// CHECK2-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l90_worker() #[[ATTR2]]
+// CHECK2-NEXT: br label [[DOTEXIT:%.*]]
+// CHECK2: .mastercheck:
+// CHECK2-NEXT: [[NVPTX_TID2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+// CHECK2-NEXT: [[NVPTX_NUM_THREADS3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK2-NEXT: [[NVPTX_WARP_SIZE4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+// CHECK2-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE4]], 1
+// CHECK2-NEXT: [[TMP3:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS3]], 1
+// CHECK2-NEXT: [[TMP4:%.*]] = xor i32 [[TMP2]], -1
+// CHECK2-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP3]], [[TMP4]]
+// CHECK2-NEXT: [[TMP5:%.*]] = icmp eq i32 [[NVPTX_TID2]], [[MASTER_TID]]
+// CHECK2-NEXT: br i1 [[TMP5]], label [[DOTMASTER:%.*]], label [[DOTEXIT]]
+// CHECK2: .master:
+// CHECK2-NEXT: [[NVPTX_NUM_THREADS5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK2-NEXT: [[NVPTX_WARP_SIZE6:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+// CHECK2-NEXT: [[THREAD_LIMIT7:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS5]], [[NVPTX_WARP_SIZE6]]
+// CHECK2-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT7]], i16 1)
+// CHECK2-NEXT: call void @__kmpc_data_sharing_init_stack()
+// CHECK2-NEXT: [[TMP6:%.*]] = load i32, i32* [[A_ADDR]], align 4
+// CHECK2-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP6]], 1
+// CHECK2-NEXT: store i32 [[ADD]], i32* [[A_ADDR]], align 4
+// CHECK2-NEXT: [[TMP7:%.*]] = load i16, i16* [[CONV]], align 4
+// CHECK2-NEXT: [[CONV8:%.*]] = sext i16 [[TMP7]] to i32
+// CHECK2-NEXT: [[ADD9:%.*]] = add nsw i32 [[CONV8]], 1
+// CHECK2-NEXT: [[CONV10:%.*]] = trunc i32 [[ADD9]] to i16
+// CHECK2-NEXT: store i16 [[CONV10]], i16* [[CONV]], align 4
+// CHECK2-NEXT: [[TMP8:%.*]] = load i8, i8* [[CONV1]], align 4
+// CHECK2-NEXT: [[CONV11:%.*]] = sext i8 [[TMP8]] to i32
+// CHECK2-NEXT: [[ADD12:%.*]] = add nsw i32 [[CONV11]], 1
+// CHECK2-NEXT: [[CONV13:%.*]] = trunc i32 [[ADD12]] to i8
+// CHECK2-NEXT: store i8 [[CONV13]], i8* [[CONV1]], align 4
+// CHECK2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP0]], i32 0, i32 2
+// CHECK2-NEXT: [[TMP9:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+// CHECK2-NEXT: [[ADD14:%.*]] = add nsw i32 [[TMP9]], 1
+// CHECK2-NEXT: store i32 [[ADD14]], i32* [[ARRAYIDX]], align 4
+// CHECK2-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]]
+// CHECK2: .termination.notifier:
+// CHECK2-NEXT: call void @__kmpc_kernel_deinit(i16 1)
+// CHECK2-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
+// CHECK2-NEXT: br label [[DOTEXIT]]
+// CHECK2: .exit:
+// CHECK2-NEXT: ret void
+//
+//
+// CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2S12r1Ei_l108_worker
+// CHECK2-SAME: () #[[ATTR3]] {
+// CHECK2-NEXT: entry:
+// CHECK2-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 4
+// CHECK2-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1
+// CHECK2-NEXT: store i8* null, i8** [[WORK_FN]], align 4
+// CHECK2-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1
+// CHECK2-NEXT: br label [[DOTAWAIT_WORK:%.*]]
+// CHECK2: .await.work:
+// CHECK2-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
+// CHECK2-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]])
+// CHECK2-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8
+// CHECK2-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1
+// CHECK2-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 4
+// CHECK2-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null
+// CHECK2-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]]
+// CHECK2: .select.workers:
+// CHECK2-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1
+// CHECK2-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0
+// CHECK2-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]]
+// CHECK2: .execute.parallel:
+// CHECK2-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]])
+// CHECK2-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)*
+// CHECK2-NEXT: call void [[TMP5]](i16 0, i32 [[TMP4]])
+// CHECK2-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]]
+// CHECK2: .terminate.parallel:
+// CHECK2-NEXT: call void @__kmpc_kernel_end_parallel()
+// CHECK2-NEXT: br label [[DOTBARRIER_PARALLEL]]
+// CHECK2: .barrier.parallel:
+// CHECK2-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
+// CHECK2-NEXT: br label [[DOTAWAIT_WORK]]
+// CHECK2: .exit:
+// CHECK2-NEXT: ret void
+//
+//
+// CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2S12r1Ei_l108
+// CHECK2-SAME: (%struct.S1* [[THIS:%.*]], i32 [[B:%.*]], i32 [[VLA:%.*]], i32 [[VLA1:%.*]], i16* nonnull align 2 dereferenceable(2) [[C:%.*]]) #[[ATTR0]] {
+// CHECK2-NEXT: entry:
+// CHECK2-NEXT: [[THIS_ADDR:%.*]] = alloca %struct.S1*, align 4
+// CHECK2-NEXT: [[B_ADDR:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[VLA_ADDR:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[VLA_ADDR2:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[C_ADDR:%.*]] = alloca i16*, align 4
+// CHECK2-NEXT: store %struct.S1* [[THIS]], %struct.S1** [[THIS_ADDR]], align 4
+// CHECK2-NEXT: store i32 [[B]], i32* [[B_ADDR]], align 4
+// CHECK2-NEXT: store i32 [[VLA]], i32* [[VLA_ADDR]], align 4
+// CHECK2-NEXT: store i32 [[VLA1]], i32* [[VLA_ADDR2]], align 4
+// CHECK2-NEXT: store i16* [[C]], i16** [[C_ADDR]], align 4
+// CHECK2-NEXT: [[TMP0:%.*]] = load %struct.S1*, %struct.S1** [[THIS_ADDR]], align 4
+// CHECK2-NEXT: [[TMP1:%.*]] = load i32, i32* [[VLA_ADDR]], align 4
+// CHECK2-NEXT: [[TMP2:%.*]] = load i32, i32* [[VLA_ADDR2]], align 4
+// CHECK2-NEXT: [[TMP3:%.*]] = load i16*, i16** [[C_ADDR]], align 4
+// CHECK2-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+// CHECK2-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK2-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+// CHECK2-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]]
+// CHECK2-NEXT: [[TMP4:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]]
+// CHECK2-NEXT: br i1 [[TMP4]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]]
+// CHECK2: .worker:
+// CHECK2-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2S12r1Ei_l108_worker() #[[ATTR2]]
+// CHECK2-NEXT: br label [[DOTEXIT:%.*]]
+// CHECK2: .mastercheck:
+// CHECK2-NEXT: [[NVPTX_TID3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+// CHECK2-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK2-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+// CHECK2-NEXT: [[TMP5:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE5]], 1
+// CHECK2-NEXT: [[TMP6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], 1
+// CHECK2-NEXT: [[TMP7:%.*]] = xor i32 [[TMP5]], -1
+// CHECK2-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP6]], [[TMP7]]
+// CHECK2-NEXT: [[TMP8:%.*]] = icmp eq i32 [[NVPTX_TID3]], [[MASTER_TID]]
+// CHECK2-NEXT: br i1 [[TMP8]], label [[DOTMASTER:%.*]], label [[DOTEXIT]]
+// CHECK2: .master:
+// CHECK2-NEXT: [[NVPTX_NUM_THREADS6:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK2-NEXT: [[NVPTX_WARP_SIZE7:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+// CHECK2-NEXT: [[THREAD_LIMIT8:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS6]], [[NVPTX_WARP_SIZE7]]
+// CHECK2-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT8]], i16 1)
+// CHECK2-NEXT: call void @__kmpc_data_sharing_init_stack()
+// CHECK2-NEXT: [[TMP9:%.*]] = load i32, i32* [[B_ADDR]], align 4
+// CHECK2-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP9]] to double
+// CHECK2-NEXT: [[ADD:%.*]] = fadd double [[CONV]], 1.500000e+00
+// CHECK2-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT_S1:%.*]], %struct.S1* [[TMP0]], i32 0, i32 0
+// CHECK2-NEXT: store double [[ADD]], double* [[A]], align 8
+// CHECK2-NEXT: [[A9:%.*]] = getelementptr inbounds [[STRUCT_S1]], %struct.S1* [[TMP0]], i32 0, i32 0
+// CHECK2-NEXT: [[TMP10:%.*]] = load double, double* [[A9]], align 8
+// CHECK2-NEXT: [[INC:%.*]] = fadd double [[TMP10]], 1.000000e+00
+// CHECK2-NEXT: store double [[INC]], double* [[A9]], align 8
+//
CHECK2-NEXT: [[CONV10:%.*]] = fptosi double [[INC]] to i16 +// CHECK2-NEXT: [[TMP11:%.*]] = mul nsw i32 1, [[TMP2]] +// CHECK2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i16, i16* [[TMP3]], i32 [[TMP11]] +// CHECK2-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds i16, i16* [[ARRAYIDX]], i32 1 +// CHECK2-NEXT: store i16 [[CONV10]], i16* [[ARRAYIDX11]], align 2 +// CHECK2-NEXT: [[A12:%.*]] = getelementptr inbounds [[STRUCT_S1]], %struct.S1* [[TMP0]], i32 0, i32 0 +// CHECK2-NEXT: [[TMP12:%.*]] = load double, double* [[A12]], align 8 +// CHECK2-NEXT: [[CONV13:%.*]] = fptosi double [[TMP12]] to i32 +// CHECK2-NEXT: [[A14:%.*]] = getelementptr inbounds [[STRUCT_S1]], %struct.S1* [[TMP0]], i32 0, i32 0 +// CHECK2-NEXT: [[CALL:%.*]] = call i32 @_Z3baziRd(i32 [[CONV13]], double* nonnull align 8 dereferenceable(8) [[A14]]) #[[ATTR7]] +// CHECK2-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]] +// CHECK2: .termination.notifier: +// CHECK2-NEXT: call void @__kmpc_kernel_deinit(i16 1) +// CHECK2-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK2-NEXT: br label [[DOTEXIT]] +// CHECK2: .exit: +// CHECK2-NEXT: ret void +// +// +// CHECK2-LABEL: define {{[^@]+}}@_Z3baziRd +// CHECK2-SAME: (i32 [[F3:%.*]], double* nonnull align 8 dereferenceable(8) [[A:%.*]]) #[[ATTR5]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[RETVAL:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[F2:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[A_ADDR:%.*]] = alloca double*, align 4 +// CHECK2-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [2 x i8*], align 4 +// CHECK2-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3:[0-9]+]]) +// CHECK2-NEXT: [[TMP1:%.*]] = call i16 @__kmpc_parallel_level(%struct.ident_t* @[[GLOB3]], i32 [[TMP0]]) +// CHECK2-NEXT: [[TMP2:%.*]] = icmp eq i16 [[TMP1]], 0 +// CHECK2-NEXT: [[TMP3:%.*]] = call i8 @__kmpc_is_spmd_exec_mode() #[[ATTR2]] +// CHECK2-NEXT: [[TMP4:%.*]] = icmp ne i8 [[TMP3]], 0 +// CHECK2-NEXT: br i1 [[TMP4]], label [[DOTSPMD:%.*]], label [[DOTNON_SPMD:%.*]] +// CHECK2: .spmd: +// CHECK2-NEXT: br label [[DOTEXIT:%.*]] +// CHECK2: .non-spmd: +// CHECK2-NEXT: [[TMP5:%.*]] = select i1 [[TMP2]], i32 4, i32 128 +// CHECK2-NEXT: [[TMP6:%.*]] = call i8* @__kmpc_data_sharing_coalesced_push_stack(i32 [[TMP5]], i16 0) +// CHECK2-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to %struct._globalized_locals_ty* +// CHECK2-NEXT: br label [[DOTEXIT]] +// CHECK2: .exit: +// CHECK2-NEXT: [[_SELECT_STACK:%.*]] = phi %struct._globalized_locals_ty* [ null, [[DOTSPMD]] ], [ [[TMP7]], [[DOTNON_SPMD]] ] +// CHECK2-NEXT: [[TMP8:%.*]] = bitcast %struct._globalized_locals_ty* [[_SELECT_STACK]] to %struct._globalized_locals_ty.0* +// CHECK2-NEXT: [[F:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[_SELECT_STACK]], i32 0, i32 0 +// CHECK2-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK2-NEXT: [[NVPTX_LANE_ID:%.*]] = and i32 [[NVPTX_TID]], 31 +// CHECK2-NEXT: [[TMP9:%.*]] = getelementptr inbounds [32 x i32], [32 x i32]* [[F]], i32 0, i32 [[NVPTX_LANE_ID]] +// CHECK2-NEXT: [[F1:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0:%.*]], %struct._globalized_locals_ty.0* [[TMP8]], i32 0, i32 0 +// CHECK2-NEXT: [[TMP10:%.*]] = select i1 [[TMP2]], i32* [[F1]], i32* [[TMP9]] +// CHECK2-NEXT: [[TMP11:%.*]] = select i1 [[TMP4]], i32* [[F2]], i32* [[TMP10]] +// CHECK2-NEXT: store i32 [[F3]], i32* [[TMP11]], align 4 +// CHECK2-NEXT: store double* [[A]], double** [[A_ADDR]], 
align 4 +// CHECK2-NEXT: [[TMP12:%.*]] = load double*, double** [[A_ADDR]], align 4 +// CHECK2-NEXT: [[TMP13:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 +// CHECK2-NEXT: [[TMP14:%.*]] = bitcast i32* [[TMP11]] to i8* +// CHECK2-NEXT: store i8* [[TMP14]], i8** [[TMP13]], align 4 +// CHECK2-NEXT: [[TMP15:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 +// CHECK2-NEXT: [[TMP16:%.*]] = bitcast double* [[TMP12]] to i8* +// CHECK2-NEXT: store i8* [[TMP16]], i8** [[TMP15]], align 4 +// CHECK2-NEXT: [[TMP17:%.*]] = bitcast [2 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK2-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP0]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32*, double*)* @__omp_outlined__1 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__1_wrapper to i8*), i8** [[TMP17]], i32 2) +// CHECK2-NEXT: [[TMP18:%.*]] = load i32, i32* [[TMP11]], align 4 +// CHECK2-NEXT: store i32 [[TMP18]], i32* [[RETVAL]], align 4 +// CHECK2-NEXT: br i1 [[TMP4]], label [[DOTEXIT5:%.*]], label [[DOTNON_SPMD4:%.*]] +// CHECK2: .non-spmd4: +// CHECK2-NEXT: [[TMP19:%.*]] = bitcast %struct._globalized_locals_ty* [[_SELECT_STACK]] to i8* +// CHECK2-NEXT: call void @__kmpc_data_sharing_pop_stack(i8* [[TMP19]]) +// CHECK2-NEXT: br label [[DOTEXIT5]] +// CHECK2: .exit5: +// CHECK2-NEXT: [[TMP20:%.*]] = load i32, i32* [[RETVAL]], align 4 +// CHECK2-NEXT: ret i32 [[TMP20]] +// +// +// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__1 +// CHECK2-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[F:%.*]], double* nonnull align 8 dereferenceable(8) [[A:%.*]]) #[[ATTR0]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK2-NEXT: [[F_ADDR:%.*]] = alloca i32*, align 4 +// CHECK2-NEXT: [[A_ADDR:%.*]] = alloca double*, align 4 +// CHECK2-NEXT: [[TMP:%.*]] = alloca double*, align 4 +// CHECK2-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK2-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK2-NEXT: store i32* [[F]], i32** [[F_ADDR]], align 4 +// CHECK2-NEXT: store double* [[A]], double** [[A_ADDR]], align 4 +// CHECK2-NEXT: [[TMP0:%.*]] = load i32*, i32** [[F_ADDR]], align 4 +// CHECK2-NEXT: [[TMP1:%.*]] = load double*, double** [[A_ADDR]], align 4 +// CHECK2-NEXT: store double* [[TMP1]], double** [[TMP]], align 4 +// CHECK2-NEXT: [[TMP2:%.*]] = load double*, double** [[TMP]], align 4 +// CHECK2-NEXT: [[TMP3:%.*]] = load double, double* [[TMP2]], align 8 +// CHECK2-NEXT: [[ADD:%.*]] = fadd double 2.000000e+00, [[TMP3]] +// CHECK2-NEXT: [[CONV:%.*]] = fptosi double [[ADD]] to i32 +// CHECK2-NEXT: store i32 [[CONV]], i32* [[TMP0]], align 4 +// CHECK2-NEXT: ret void +// +// +// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__1_wrapper +// CHECK2-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR3]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 +// CHECK2-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 4 +// CHECK2-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK2-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 +// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// 
CHECK2-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) +// CHECK2-NEXT: [[TMP2:%.*]] = load i8**, i8*** [[GLOBAL_ARGS]], align 4 +// CHECK2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8*, i8** [[TMP2]], i32 0 +// CHECK2-NEXT: [[TMP4:%.*]] = bitcast i8** [[TMP3]] to i32** +// CHECK2-NEXT: [[TMP5:%.*]] = load i32*, i32** [[TMP4]], align 4 +// CHECK2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8*, i8** [[TMP2]], i32 1 +// CHECK2-NEXT: [[TMP7:%.*]] = bitcast i8** [[TMP6]] to double** +// CHECK2-NEXT: [[TMP8:%.*]] = load double*, double** [[TMP7]], align 4 +// CHECK2-NEXT: call void @__omp_outlined__1(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]], i32* [[TMP5]], double* [[TMP8]]) #[[ATTR2]] +// CHECK2-NEXT: ret void +// +// +// CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z16unreachable_callv_l142_worker +// CHECK2-SAME: () #[[ATTR3]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 4 +// CHECK2-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1 +// CHECK2-NEXT: store i8* null, i8** [[WORK_FN]], align 4 +// CHECK2-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1 +// CHECK2-NEXT: br label [[DOTAWAIT_WORK:%.*]] +// CHECK2: .await.work: +// CHECK2-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK2-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]]) +// CHECK2-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8 +// CHECK2-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1 +// CHECK2-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 4 +// CHECK2-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null +// CHECK2-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]] +// CHECK2: .select.workers: +// CHECK2-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1 +// CHECK2-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0 +// CHECK2-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]] +// CHECK2: .execute.parallel: +// CHECK2-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]]) +// CHECK2-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)* +// CHECK2-NEXT: call void [[TMP5]](i16 0, i32 [[TMP4]]) +// CHECK2-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]] +// CHECK2: .terminate.parallel: +// CHECK2-NEXT: call void @__kmpc_kernel_end_parallel() +// CHECK2-NEXT: br label [[DOTBARRIER_PARALLEL]] +// CHECK2: .barrier.parallel: +// CHECK2-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK2-NEXT: br label [[DOTAWAIT_WORK]] +// CHECK2: .exit: +// CHECK2-NEXT: ret void +// +// +// CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z16unreachable_callv_l142 +// CHECK2-SAME: () #[[ATTR0]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK2-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK2-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK2-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]] +// CHECK2-NEXT: [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]] +// CHECK2-NEXT: br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]] +// CHECK2: .worker: +// CHECK2-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z16unreachable_callv_l142_worker() #[[ATTR2]] +// CHECK2-NEXT: br label [[DOTEXIT:%.*]] +// CHECK2: .mastercheck: 
+// CHECK2-NEXT: [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK2-NEXT: [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK2-NEXT: [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK2-NEXT: [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1 +// CHECK2-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1 +// CHECK2-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], -1 +// CHECK2-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP2]], [[TMP3]] +// CHECK2-NEXT: [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]] +// CHECK2-NEXT: br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]] +// CHECK2: .master: +// CHECK2-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK2-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK2-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]] +// CHECK2-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1) +// CHECK2-NEXT: call void @__kmpc_data_sharing_init_stack() +// CHECK2-NEXT: call void @_Z6asserti(i32 0) #[[ATTR8:[0-9]+]] +// CHECK2-NEXT: unreachable +// CHECK2: 5: +// CHECK2-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]] +// CHECK2: .termination.notifier: +// CHECK2-NEXT: call void @__kmpc_kernel_deinit(i16 1) +// CHECK2-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK2-NEXT: br label [[DOTEXIT]] +// CHECK2: .exit: +// CHECK2-NEXT: ret void +// +// +// CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l74_worker +// CHECK2-SAME: () #[[ATTR3]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 4 +// CHECK2-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1 +// CHECK2-NEXT: store i8* null, i8** [[WORK_FN]], align 4 +// CHECK2-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1 +// CHECK2-NEXT: br label [[DOTAWAIT_WORK:%.*]] +// CHECK2: .await.work: +// CHECK2-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK2-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]]) +// CHECK2-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8 +// CHECK2-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1 +// CHECK2-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 4 +// CHECK2-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null +// CHECK2-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]] +// CHECK2: .select.workers: +// CHECK2-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1 +// CHECK2-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0 +// CHECK2-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]] +// CHECK2: .execute.parallel: +// CHECK2-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]]) +// CHECK2-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)* +// CHECK2-NEXT: call void [[TMP5]](i16 0, i32 [[TMP4]]) +// CHECK2-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]] +// CHECK2: .terminate.parallel: +// CHECK2-NEXT: call void @__kmpc_kernel_end_parallel() +// CHECK2-NEXT: br label [[DOTBARRIER_PARALLEL]] +// CHECK2: .barrier.parallel: +// CHECK2-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK2-NEXT: br label [[DOTAWAIT_WORK]] +// CHECK2: .exit: +// CHECK2-NEXT: ret void +// +// +// CHECK2-LABEL: define 
{{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l74 +// CHECK2-SAME: (i32 [[A:%.*]], i32 [[AA:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[AA_ADDR:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 +// CHECK2-NEXT: store i32 [[A]], i32* [[A_ADDR]], align 4 +// CHECK2-NEXT: store i32 [[AA]], i32* [[AA_ADDR]], align 4 +// CHECK2-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 +// CHECK2-NEXT: [[CONV:%.*]] = bitcast i32* [[AA_ADDR]] to i16* +// CHECK2-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 +// CHECK2-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK2-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK2-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK2-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]] +// CHECK2-NEXT: [[TMP1:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]] +// CHECK2-NEXT: br i1 [[TMP1]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]] +// CHECK2: .worker: +// CHECK2-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l74_worker() #[[ATTR2]] +// CHECK2-NEXT: br label [[DOTEXIT:%.*]] +// CHECK2: .mastercheck: +// CHECK2-NEXT: [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK2-NEXT: [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK2-NEXT: [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK2-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1 +// CHECK2-NEXT: [[TMP3:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1 +// CHECK2-NEXT: [[TMP4:%.*]] = xor i32 [[TMP2]], -1 +// CHECK2-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP3]], [[TMP4]] +// CHECK2-NEXT: [[TMP5:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]] +// CHECK2-NEXT: br i1 [[TMP5]], label [[DOTMASTER:%.*]], label [[DOTEXIT]] +// CHECK2: .master: +// CHECK2-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK2-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK2-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]] +// CHECK2-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1) +// CHECK2-NEXT: call void @__kmpc_data_sharing_init_stack() +// CHECK2-NEXT: [[TMP6:%.*]] = load i32, i32* [[A_ADDR]], align 4 +// CHECK2-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP6]], 1 +// CHECK2-NEXT: store i32 [[ADD]], i32* [[A_ADDR]], align 4 +// CHECK2-NEXT: [[TMP7:%.*]] = load i16, i16* [[CONV]], align 4 +// CHECK2-NEXT: [[CONV7:%.*]] = sext i16 [[TMP7]] to i32 +// CHECK2-NEXT: [[ADD8:%.*]] = add nsw i32 [[CONV7]], 1 +// CHECK2-NEXT: [[CONV9:%.*]] = trunc i32 [[ADD8]] to i16 +// CHECK2-NEXT: store i16 [[CONV9]], i16* [[CONV]], align 4 +// CHECK2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP0]], i32 0, i32 2 +// CHECK2-NEXT: [[TMP8:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +// CHECK2-NEXT: [[ADD10:%.*]] = add nsw i32 [[TMP8]], 1 +// CHECK2-NEXT: store i32 [[ADD10]], i32* [[ARRAYIDX]], align 4 +// CHECK2-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]] +// CHECK2: .termination.notifier: +// CHECK2-NEXT: call void @__kmpc_kernel_deinit(i16 1) +// CHECK2-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, 
i32 0) +// CHECK2-NEXT: br label [[DOTEXIT]] +// CHECK2: .exit: +// CHECK2-NEXT: ret void +// +// +// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9targetBarPiS__l25 +// CHECK3-SAME: (i32* [[PTR1:%.*]], i32** nonnull align 4 dereferenceable(4) [[PTR2:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK3-NEXT: entry: +// CHECK3-NEXT: [[PTR1_ADDR:%.*]] = alloca i32*, align 4 +// CHECK3-NEXT: [[PTR2_ADDR:%.*]] = alloca i32**, align 4 +// CHECK3-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [2 x i8*], align 4 +// CHECK3-NEXT: store i32* [[PTR1]], i32** [[PTR1_ADDR]], align 4 +// CHECK3-NEXT: store i32** [[PTR2]], i32*** [[PTR2_ADDR]], align 4 +// CHECK3-NEXT: [[TMP0:%.*]] = load i32**, i32*** [[PTR2_ADDR]], align 4 +// CHECK3-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK3-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 1) +// CHECK3-NEXT: call void @__kmpc_data_sharing_init_stack_spmd() +// CHECK3-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK3: .execute: +// CHECK3-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]]) +// CHECK3-NEXT: [[TMP2:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 +// CHECK3-NEXT: [[TMP3:%.*]] = bitcast i32** [[PTR1_ADDR]] to i8* +// CHECK3-NEXT: store i8* [[TMP3]], i8** [[TMP2]], align 4 +// CHECK3-NEXT: [[TMP4:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 +// CHECK3-NEXT: [[TMP5:%.*]] = bitcast i32** [[TMP0]] to i8* +// CHECK3-NEXT: store i8* [[TMP5]], i8** [[TMP4]], align 4 +// CHECK3-NEXT: [[TMP6:%.*]] = bitcast [2 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK3-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32**, i32**)* @__omp_outlined__ to i8*), i8* null, i8** [[TMP6]], i32 2) +// CHECK3-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK3: .omp.deinit: +// CHECK3-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 1) +// CHECK3-NEXT: br label [[DOTEXIT:%.*]] +// CHECK3: .exit: +// CHECK3-NEXT: ret void +// +// +// CHECK3-LABEL: define {{[^@]+}}@__omp_outlined__ +// CHECK3-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32** nonnull align 4 dereferenceable(4) [[PTR1:%.*]], i32** nonnull align 4 dereferenceable(4) [[PTR2:%.*]]) #[[ATTR0]] { +// CHECK3-NEXT: entry: +// CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK3-NEXT: [[PTR1_ADDR:%.*]] = alloca i32**, align 4 +// CHECK3-NEXT: [[PTR2_ADDR:%.*]] = alloca i32**, align 4 +// CHECK3-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK3-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK3-NEXT: store i32** [[PTR1]], i32*** [[PTR1_ADDR]], align 4 +// CHECK3-NEXT: store i32** [[PTR2]], i32*** [[PTR2_ADDR]], align 4 +// CHECK3-NEXT: [[TMP0:%.*]] = load i32**, i32*** [[PTR1_ADDR]], align 4 +// CHECK3-NEXT: [[TMP1:%.*]] = load i32**, i32*** [[PTR2_ADDR]], align 4 +// CHECK3-NEXT: [[TMP2:%.*]] = load i32*, i32** [[TMP1]], align 4 +// CHECK3-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP2]], align 4 +// CHECK3-NEXT: [[TMP4:%.*]] = load i32*, i32** [[TMP0]], align 4 +// CHECK3-NEXT: store i32 [[TMP3]], i32* [[TMP4]], align 4 +// CHECK3-NEXT: ret void +// +// +// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l39_worker +// 
CHECK3-SAME: () #[[ATTR3:[0-9]+]] { +// CHECK3-NEXT: entry: +// CHECK3-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 4 +// CHECK3-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1 +// CHECK3-NEXT: store i8* null, i8** [[WORK_FN]], align 4 +// CHECK3-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1 +// CHECK3-NEXT: br label [[DOTAWAIT_WORK:%.*]] +// CHECK3: .await.work: +// CHECK3-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK3-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]]) +// CHECK3-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8 +// CHECK3-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1 +// CHECK3-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 4 +// CHECK3-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null +// CHECK3-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]] +// CHECK3: .select.workers: +// CHECK3-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1 +// CHECK3-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0 +// CHECK3-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]] +// CHECK3: .execute.parallel: +// CHECK3-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2:[0-9]+]]) +// CHECK3-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)* +// CHECK3-NEXT: call void [[TMP5]](i16 0, i32 [[TMP4]]) +// CHECK3-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]] +// CHECK3: .terminate.parallel: +// CHECK3-NEXT: call void @__kmpc_kernel_end_parallel() +// CHECK3-NEXT: br label [[DOTBARRIER_PARALLEL]] +// CHECK3: .barrier.parallel: +// CHECK3-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK3-NEXT: br label [[DOTAWAIT_WORK]] +// CHECK3: .exit: +// CHECK3-NEXT: ret void +// +// +// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l39 +// CHECK3-SAME: () #[[ATTR0]] { +// CHECK3-NEXT: entry: +// CHECK3-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK3-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK3-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK3-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]] +// CHECK3-NEXT: [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]] +// CHECK3-NEXT: br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]] +// CHECK3: .worker: +// CHECK3-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l39_worker() #[[ATTR2:[0-9]+]] +// CHECK3-NEXT: br label [[DOTEXIT:%.*]] +// CHECK3: .mastercheck: +// CHECK3-NEXT: [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK3-NEXT: [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK3-NEXT: [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK3-NEXT: [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1 +// CHECK3-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1 +// CHECK3-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], -1 +// CHECK3-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP2]], [[TMP3]] +// CHECK3-NEXT: [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]] +// CHECK3-NEXT: br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]] +// CHECK3: .master: +// CHECK3-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK3-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// 
CHECK3-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]] +// CHECK3-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1) +// CHECK3-NEXT: call void @__kmpc_data_sharing_init_stack() +// CHECK3-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]] +// CHECK3: .termination.notifier: +// CHECK3-NEXT: call void @__kmpc_kernel_deinit(i16 1) +// CHECK3-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK3-NEXT: br label [[DOTEXIT]] +// CHECK3: .exit: +// CHECK3-NEXT: ret void +// +// +// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l47_worker +// CHECK3-SAME: () #[[ATTR3]] { +// CHECK3-NEXT: entry: +// CHECK3-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 4 +// CHECK3-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1 +// CHECK3-NEXT: store i8* null, i8** [[WORK_FN]], align 4 +// CHECK3-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1 +// CHECK3-NEXT: br label [[DOTAWAIT_WORK:%.*]] +// CHECK3: .await.work: +// CHECK3-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK3-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]]) +// CHECK3-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8 +// CHECK3-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1 +// CHECK3-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 4 +// CHECK3-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null +// CHECK3-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]] +// CHECK3: .select.workers: +// CHECK3-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1 +// CHECK3-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0 +// CHECK3-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]] +// CHECK3: .execute.parallel: +// CHECK3-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]]) +// CHECK3-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)* +// CHECK3-NEXT: call void [[TMP5]](i16 0, i32 [[TMP4]]) +// CHECK3-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]] +// CHECK3: .terminate.parallel: +// CHECK3-NEXT: call void @__kmpc_kernel_end_parallel() +// CHECK3-NEXT: br label [[DOTBARRIER_PARALLEL]] +// CHECK3: .barrier.parallel: +// CHECK3-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK3-NEXT: br label [[DOTAWAIT_WORK]] +// CHECK3: .exit: +// CHECK3-NEXT: ret void +// +// +// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l47 +// CHECK3-SAME: (i32 [[AA:%.*]]) #[[ATTR0]] { +// CHECK3-NEXT: entry: +// CHECK3-NEXT: [[AA_ADDR:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: store i32 [[AA]], i32* [[AA_ADDR]], align 4 +// CHECK3-NEXT: [[CONV:%.*]] = bitcast i32* [[AA_ADDR]] to i16* +// CHECK3-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK3-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK3-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK3-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]] +// CHECK3-NEXT: [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]] +// CHECK3-NEXT: br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]] +// CHECK3: .worker: +// CHECK3-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l47_worker() #[[ATTR2]] +// CHECK3-NEXT: br label [[DOTEXIT:%.*]] +// CHECK3: .mastercheck: +// CHECK3-NEXT: [[NVPTX_TID1:%.*]] = 
call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK3-NEXT: [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK3-NEXT: [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK3-NEXT: [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1 +// CHECK3-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1 +// CHECK3-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], -1 +// CHECK3-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP2]], [[TMP3]] +// CHECK3-NEXT: [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]] +// CHECK3-NEXT: br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]] +// CHECK3: .master: +// CHECK3-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK3-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK3-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]] +// CHECK3-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1) +// CHECK3-NEXT: call void @__kmpc_data_sharing_init_stack() +// CHECK3-NEXT: [[TMP5:%.*]] = load i16, i16* [[CONV]], align 4 +// CHECK3-NEXT: [[CONV7:%.*]] = sext i16 [[TMP5]] to i32 +// CHECK3-NEXT: [[ADD:%.*]] = add nsw i32 [[CONV7]], 1 +// CHECK3-NEXT: [[CONV8:%.*]] = trunc i32 [[ADD]] to i16 +// CHECK3-NEXT: store i16 [[CONV8]], i16* [[CONV]], align 4 +// CHECK3-NEXT: [[TMP6:%.*]] = load i16, i16* [[CONV]], align 4 +// CHECK3-NEXT: [[CONV9:%.*]] = sext i16 [[TMP6]] to i32 +// CHECK3-NEXT: [[ADD10:%.*]] = add nsw i32 [[CONV9]], 2 +// CHECK3-NEXT: [[CONV11:%.*]] = trunc i32 [[ADD10]] to i16 +// CHECK3-NEXT: store i16 [[CONV11]], i16* [[CONV]], align 4 +// CHECK3-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]] +// CHECK3: .termination.notifier: +// CHECK3-NEXT: call void @__kmpc_kernel_deinit(i16 1) +// CHECK3-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK3-NEXT: br label [[DOTEXIT]] +// CHECK3: .exit: +// CHECK3-NEXT: ret void +// +// +// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l53_worker +// CHECK3-SAME: () #[[ATTR3]] { +// CHECK3-NEXT: entry: +// CHECK3-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 4 +// CHECK3-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1 +// CHECK3-NEXT: store i8* null, i8** [[WORK_FN]], align 4 +// CHECK3-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1 +// CHECK3-NEXT: br label [[DOTAWAIT_WORK:%.*]] +// CHECK3: .await.work: +// CHECK3-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK3-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]]) +// CHECK3-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8 +// CHECK3-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1 +// CHECK3-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 4 +// CHECK3-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null +// CHECK3-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]] +// CHECK3: .select.workers: +// CHECK3-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1 +// CHECK3-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0 +// CHECK3-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]] +// CHECK3: .execute.parallel: +// CHECK3-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]]) +// CHECK3-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)* +// CHECK3-NEXT: call void [[TMP5]](i16 0, i32 [[TMP4]]) +// CHECK3-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]] +// 
CHECK3: .terminate.parallel: +// CHECK3-NEXT: call void @__kmpc_kernel_end_parallel() +// CHECK3-NEXT: br label [[DOTBARRIER_PARALLEL]] +// CHECK3: .barrier.parallel: +// CHECK3-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK3-NEXT: br label [[DOTAWAIT_WORK]] +// CHECK3: .exit: +// CHECK3-NEXT: ret void +// +// +// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l53 +// CHECK3-SAME: (i32 [[A:%.*]], [10 x float]* nonnull align 4 dereferenceable(40) [[B:%.*]], i32 [[VLA:%.*]], float* nonnull align 4 dereferenceable(4) [[BN:%.*]], [5 x [10 x double]]* nonnull align 8 dereferenceable(400) [[C:%.*]], i32 [[VLA1:%.*]], i32 [[VLA3:%.*]], double* nonnull align 8 dereferenceable(8) [[CN:%.*]], %struct.TT* nonnull align 8 dereferenceable(16) [[D:%.*]]) #[[ATTR0]] { +// CHECK3-NEXT: entry: +// CHECK3-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[B_ADDR:%.*]] = alloca [10 x float]*, align 4 +// CHECK3-NEXT: [[VLA_ADDR:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[BN_ADDR:%.*]] = alloca float*, align 4 +// CHECK3-NEXT: [[C_ADDR:%.*]] = alloca [5 x [10 x double]]*, align 4 +// CHECK3-NEXT: [[VLA_ADDR2:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[VLA_ADDR4:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[CN_ADDR:%.*]] = alloca double*, align 4 +// CHECK3-NEXT: [[D_ADDR:%.*]] = alloca %struct.TT*, align 4 +// CHECK3-NEXT: store i32 [[A]], i32* [[A_ADDR]], align 4 +// CHECK3-NEXT: store [10 x float]* [[B]], [10 x float]** [[B_ADDR]], align 4 +// CHECK3-NEXT: store i32 [[VLA]], i32* [[VLA_ADDR]], align 4 +// CHECK3-NEXT: store float* [[BN]], float** [[BN_ADDR]], align 4 +// CHECK3-NEXT: store [5 x [10 x double]]* [[C]], [5 x [10 x double]]** [[C_ADDR]], align 4 +// CHECK3-NEXT: store i32 [[VLA1]], i32* [[VLA_ADDR2]], align 4 +// CHECK3-NEXT: store i32 [[VLA3]], i32* [[VLA_ADDR4]], align 4 +// CHECK3-NEXT: store double* [[CN]], double** [[CN_ADDR]], align 4 +// CHECK3-NEXT: store %struct.TT* [[D]], %struct.TT** [[D_ADDR]], align 4 +// CHECK3-NEXT: [[TMP0:%.*]] = load [10 x float]*, [10 x float]** [[B_ADDR]], align 4 +// CHECK3-NEXT: [[TMP1:%.*]] = load i32, i32* [[VLA_ADDR]], align 4 +// CHECK3-NEXT: [[TMP2:%.*]] = load float*, float** [[BN_ADDR]], align 4 +// CHECK3-NEXT: [[TMP3:%.*]] = load [5 x [10 x double]]*, [5 x [10 x double]]** [[C_ADDR]], align 4 +// CHECK3-NEXT: [[TMP4:%.*]] = load i32, i32* [[VLA_ADDR2]], align 4 +// CHECK3-NEXT: [[TMP5:%.*]] = load i32, i32* [[VLA_ADDR4]], align 4 +// CHECK3-NEXT: [[TMP6:%.*]] = load double*, double** [[CN_ADDR]], align 4 +// CHECK3-NEXT: [[TMP7:%.*]] = load %struct.TT*, %struct.TT** [[D_ADDR]], align 4 +// CHECK3-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK3-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK3-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK3-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]] +// CHECK3-NEXT: [[TMP8:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]] +// CHECK3-NEXT: br i1 [[TMP8]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]] +// CHECK3: .worker: +// CHECK3-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l53_worker() #[[ATTR2]] +// CHECK3-NEXT: br label [[DOTEXIT:%.*]] +// CHECK3: .mastercheck: +// CHECK3-NEXT: [[NVPTX_TID5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK3-NEXT: [[NVPTX_NUM_THREADS6:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK3-NEXT: 
[[NVPTX_WARP_SIZE7:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK3-NEXT: [[TMP9:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE7]], 1 +// CHECK3-NEXT: [[TMP10:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS6]], 1 +// CHECK3-NEXT: [[TMP11:%.*]] = xor i32 [[TMP9]], -1 +// CHECK3-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP10]], [[TMP11]] +// CHECK3-NEXT: [[TMP12:%.*]] = icmp eq i32 [[NVPTX_TID5]], [[MASTER_TID]] +// CHECK3-NEXT: br i1 [[TMP12]], label [[DOTMASTER:%.*]], label [[DOTEXIT]] +// CHECK3: .master: +// CHECK3-NEXT: [[NVPTX_NUM_THREADS8:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK3-NEXT: [[NVPTX_WARP_SIZE9:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK3-NEXT: [[THREAD_LIMIT10:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS8]], [[NVPTX_WARP_SIZE9]] +// CHECK3-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT10]], i16 1) +// CHECK3-NEXT: call void @__kmpc_data_sharing_init_stack() +// CHECK3-NEXT: [[TMP13:%.*]] = load i32, i32* [[A_ADDR]], align 4 +// CHECK3-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP13]], 1 +// CHECK3-NEXT: store i32 [[ADD]], i32* [[A_ADDR]], align 4 +// CHECK3-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x float], [10 x float]* [[TMP0]], i32 0, i32 2 +// CHECK3-NEXT: [[TMP14:%.*]] = load float, float* [[ARRAYIDX]], align 4 +// CHECK3-NEXT: [[CONV:%.*]] = fpext float [[TMP14]] to double +// CHECK3-NEXT: [[ADD11:%.*]] = fadd double [[CONV]], 1.000000e+00 +// CHECK3-NEXT: [[CONV12:%.*]] = fptrunc double [[ADD11]] to float +// CHECK3-NEXT: store float [[CONV12]], float* [[ARRAYIDX]], align 4 +// CHECK3-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds float, float* [[TMP2]], i32 3 +// CHECK3-NEXT: [[TMP15:%.*]] = load float, float* [[ARRAYIDX13]], align 4 +// CHECK3-NEXT: [[CONV14:%.*]] = fpext float [[TMP15]] to double +// CHECK3-NEXT: [[ADD15:%.*]] = fadd double [[CONV14]], 1.000000e+00 +// CHECK3-NEXT: [[CONV16:%.*]] = fptrunc double [[ADD15]] to float +// CHECK3-NEXT: store float [[CONV16]], float* [[ARRAYIDX13]], align 4 +// CHECK3-NEXT: [[ARRAYIDX17:%.*]] = getelementptr inbounds [5 x [10 x double]], [5 x [10 x double]]* [[TMP3]], i32 0, i32 1 +// CHECK3-NEXT: [[ARRAYIDX18:%.*]] = getelementptr inbounds [10 x double], [10 x double]* [[ARRAYIDX17]], i32 0, i32 2 +// CHECK3-NEXT: [[TMP16:%.*]] = load double, double* [[ARRAYIDX18]], align 8 +// CHECK3-NEXT: [[ADD19:%.*]] = fadd double [[TMP16]], 1.000000e+00 +// CHECK3-NEXT: store double [[ADD19]], double* [[ARRAYIDX18]], align 8 +// CHECK3-NEXT: [[TMP17:%.*]] = mul nsw i32 1, [[TMP5]] +// CHECK3-NEXT: [[ARRAYIDX20:%.*]] = getelementptr inbounds double, double* [[TMP6]], i32 [[TMP17]] +// CHECK3-NEXT: [[ARRAYIDX21:%.*]] = getelementptr inbounds double, double* [[ARRAYIDX20]], i32 3 +// CHECK3-NEXT: [[TMP18:%.*]] = load double, double* [[ARRAYIDX21]], align 8 +// CHECK3-NEXT: [[ADD22:%.*]] = fadd double [[TMP18]], 1.000000e+00 +// CHECK3-NEXT: store double [[ADD22]], double* [[ARRAYIDX21]], align 8 +// CHECK3-NEXT: [[X:%.*]] = getelementptr inbounds [[STRUCT_TT:%.*]], %struct.TT* [[TMP7]], i32 0, i32 0 +// CHECK3-NEXT: [[TMP19:%.*]] = load i64, i64* [[X]], align 8 +// CHECK3-NEXT: [[ADD23:%.*]] = add nsw i64 [[TMP19]], 1 +// CHECK3-NEXT: store i64 [[ADD23]], i64* [[X]], align 8 +// CHECK3-NEXT: [[Y:%.*]] = getelementptr inbounds [[STRUCT_TT]], %struct.TT* [[TMP7]], i32 0, i32 1 +// CHECK3-NEXT: [[TMP20:%.*]] = load i8, i8* [[Y]], align 8 +// CHECK3-NEXT: [[CONV24:%.*]] = sext i8 [[TMP20]] to i32 +// CHECK3-NEXT: [[ADD25:%.*]] = add nsw i32 [[CONV24]], 1 +// CHECK3-NEXT: [[CONV26:%.*]] = trunc 
i32 [[ADD25]] to i8 +// CHECK3-NEXT: store i8 [[CONV26]], i8* [[Y]], align 8 +// CHECK3-NEXT: [[CALL:%.*]] = call nonnull align 8 dereferenceable(8) i64* @_ZN2TTIxcEixEi(%struct.TT* nonnull dereferenceable(16) [[TMP7]], i32 0) #[[ATTR7:[0-9]+]] +// CHECK3-NEXT: [[TMP21:%.*]] = load i64, i64* [[CALL]], align 8 +// CHECK3-NEXT: [[ADD27:%.*]] = add nsw i64 [[TMP21]], 1 +// CHECK3-NEXT: store i64 [[ADD27]], i64* [[CALL]], align 8 +// CHECK3-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]] +// CHECK3: .termination.notifier: +// CHECK3-NEXT: call void @__kmpc_kernel_deinit(i16 1) +// CHECK3-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK3-NEXT: br label [[DOTEXIT]] +// CHECK3: .exit: +// CHECK3-NEXT: ret void +// +// +// CHECK3-LABEL: define {{[^@]+}}@_ZN2TTIxcEixEi +// CHECK3-SAME: (%struct.TT* nonnull dereferenceable(16) [[THIS:%.*]], i32 [[I:%.*]]) #[[ATTR5:[0-9]+]] comdat align 2 { +// CHECK3-NEXT: entry: +// CHECK3-NEXT: [[THIS_ADDR:%.*]] = alloca %struct.TT*, align 4 +// CHECK3-NEXT: [[I_ADDR:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: store %struct.TT* [[THIS]], %struct.TT** [[THIS_ADDR]], align 4 +// CHECK3-NEXT: store i32 [[I]], i32* [[I_ADDR]], align 4 +// CHECK3-NEXT: [[THIS1:%.*]] = load %struct.TT*, %struct.TT** [[THIS_ADDR]], align 4 +// CHECK3-NEXT: [[X:%.*]] = getelementptr inbounds [[STRUCT_TT:%.*]], %struct.TT* [[THIS1]], i32 0, i32 0 +// CHECK3-NEXT: ret i64* [[X]] +// +// +// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l90_worker +// CHECK3-SAME: () #[[ATTR3]] { +// CHECK3-NEXT: entry: +// CHECK3-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 4 +// CHECK3-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1 +// CHECK3-NEXT: store i8* null, i8** [[WORK_FN]], align 4 +// CHECK3-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1 +// CHECK3-NEXT: br label [[DOTAWAIT_WORK:%.*]] +// CHECK3: .await.work: +// CHECK3-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK3-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]]) +// CHECK3-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8 +// CHECK3-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1 +// CHECK3-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 4 +// CHECK3-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null +// CHECK3-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]] +// CHECK3: .select.workers: +// CHECK3-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1 +// CHECK3-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0 +// CHECK3-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]] +// CHECK3: .execute.parallel: +// CHECK3-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]]) +// CHECK3-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)* +// CHECK3-NEXT: call void [[TMP5]](i16 0, i32 [[TMP4]]) +// CHECK3-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]] +// CHECK3: .terminate.parallel: +// CHECK3-NEXT: call void @__kmpc_kernel_end_parallel() +// CHECK3-NEXT: br label [[DOTBARRIER_PARALLEL]] +// CHECK3: .barrier.parallel: +// CHECK3-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK3-NEXT: br label [[DOTAWAIT_WORK]] +// CHECK3: .exit: +// CHECK3-NEXT: ret void +// +// +// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l90 +// CHECK3-SAME: (i32 [[A:%.*]], i32 [[AA:%.*]], i32 [[AAA:%.*]], [10 x i32]* nonnull align 4 
dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { +// CHECK3-NEXT: entry: +// CHECK3-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[AA_ADDR:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[AAA_ADDR:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 +// CHECK3-NEXT: store i32 [[A]], i32* [[A_ADDR]], align 4 +// CHECK3-NEXT: store i32 [[AA]], i32* [[AA_ADDR]], align 4 +// CHECK3-NEXT: store i32 [[AAA]], i32* [[AAA_ADDR]], align 4 +// CHECK3-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 +// CHECK3-NEXT: [[CONV:%.*]] = bitcast i32* [[AA_ADDR]] to i16* +// CHECK3-NEXT: [[CONV1:%.*]] = bitcast i32* [[AAA_ADDR]] to i8* +// CHECK3-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 +// CHECK3-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK3-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK3-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK3-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]] +// CHECK3-NEXT: [[TMP1:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]] +// CHECK3-NEXT: br i1 [[TMP1]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]] +// CHECK3: .worker: +// CHECK3-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l90_worker() #[[ATTR2]] +// CHECK3-NEXT: br label [[DOTEXIT:%.*]] +// CHECK3: .mastercheck: +// CHECK3-NEXT: [[NVPTX_TID2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK3-NEXT: [[NVPTX_NUM_THREADS3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK3-NEXT: [[NVPTX_WARP_SIZE4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK3-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE4]], 1 +// CHECK3-NEXT: [[TMP3:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS3]], 1 +// CHECK3-NEXT: [[TMP4:%.*]] = xor i32 [[TMP2]], -1 +// CHECK3-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP3]], [[TMP4]] +// CHECK3-NEXT: [[TMP5:%.*]] = icmp eq i32 [[NVPTX_TID2]], [[MASTER_TID]] +// CHECK3-NEXT: br i1 [[TMP5]], label [[DOTMASTER:%.*]], label [[DOTEXIT]] +// CHECK3: .master: +// CHECK3-NEXT: [[NVPTX_NUM_THREADS5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK3-NEXT: [[NVPTX_WARP_SIZE6:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK3-NEXT: [[THREAD_LIMIT7:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS5]], [[NVPTX_WARP_SIZE6]] +// CHECK3-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT7]], i16 1) +// CHECK3-NEXT: call void @__kmpc_data_sharing_init_stack() +// CHECK3-NEXT: [[TMP6:%.*]] = load i32, i32* [[A_ADDR]], align 4 +// CHECK3-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP6]], 1 +// CHECK3-NEXT: store i32 [[ADD]], i32* [[A_ADDR]], align 4 +// CHECK3-NEXT: [[TMP7:%.*]] = load i16, i16* [[CONV]], align 4 +// CHECK3-NEXT: [[CONV8:%.*]] = sext i16 [[TMP7]] to i32 +// CHECK3-NEXT: [[ADD9:%.*]] = add nsw i32 [[CONV8]], 1 +// CHECK3-NEXT: [[CONV10:%.*]] = trunc i32 [[ADD9]] to i16 +// CHECK3-NEXT: store i16 [[CONV10]], i16* [[CONV]], align 4 +// CHECK3-NEXT: [[TMP8:%.*]] = load i8, i8* [[CONV1]], align 4 +// CHECK3-NEXT: [[CONV11:%.*]] = sext i8 [[TMP8]] to i32 +// CHECK3-NEXT: [[ADD12:%.*]] = add nsw i32 [[CONV11]], 1 +// CHECK3-NEXT: [[CONV13:%.*]] = trunc i32 [[ADD12]] to i8 +// CHECK3-NEXT: store i8 [[CONV13]], i8* [[CONV1]], align 4 +// CHECK3-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP0]], i32 0, i32 2 +// CHECK3-NEXT: [[TMP9:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +// CHECK3-NEXT: 
[[ADD14:%.*]] = add nsw i32 [[TMP9]], 1 +// CHECK3-NEXT: store i32 [[ADD14]], i32* [[ARRAYIDX]], align 4 +// CHECK3-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]] +// CHECK3: .termination.notifier: +// CHECK3-NEXT: call void @__kmpc_kernel_deinit(i16 1) +// CHECK3-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK3-NEXT: br label [[DOTEXIT]] +// CHECK3: .exit: +// CHECK3-NEXT: ret void +// +// +// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2S12r1Ei_l108_worker +// CHECK3-SAME: () #[[ATTR3]] { +// CHECK3-NEXT: entry: +// CHECK3-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 4 +// CHECK3-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1 +// CHECK3-NEXT: store i8* null, i8** [[WORK_FN]], align 4 +// CHECK3-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1 +// CHECK3-NEXT: br label [[DOTAWAIT_WORK:%.*]] +// CHECK3: .await.work: +// CHECK3-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK3-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]]) +// CHECK3-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8 +// CHECK3-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1 +// CHECK3-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 4 +// CHECK3-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null +// CHECK3-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]] +// CHECK3: .select.workers: +// CHECK3-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1 +// CHECK3-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0 +// CHECK3-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]] +// CHECK3: .execute.parallel: +// CHECK3-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]]) +// CHECK3-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)* +// CHECK3-NEXT: call void [[TMP5]](i16 0, i32 [[TMP4]]) +// CHECK3-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]] +// CHECK3: .terminate.parallel: +// CHECK3-NEXT: call void @__kmpc_kernel_end_parallel() +// CHECK3-NEXT: br label [[DOTBARRIER_PARALLEL]] +// CHECK3: .barrier.parallel: +// CHECK3-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK3-NEXT: br label [[DOTAWAIT_WORK]] +// CHECK3: .exit: +// CHECK3-NEXT: ret void +// +// +// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2S12r1Ei_l108 +// CHECK3-SAME: (%struct.S1* [[THIS:%.*]], i32 [[B:%.*]], i32 [[VLA:%.*]], i32 [[VLA1:%.*]], i16* nonnull align 2 dereferenceable(2) [[C:%.*]]) #[[ATTR0]] { +// CHECK3-NEXT: entry: +// CHECK3-NEXT: [[THIS_ADDR:%.*]] = alloca %struct.S1*, align 4 +// CHECK3-NEXT: [[B_ADDR:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[VLA_ADDR:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[VLA_ADDR2:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[C_ADDR:%.*]] = alloca i16*, align 4 +// CHECK3-NEXT: store %struct.S1* [[THIS]], %struct.S1** [[THIS_ADDR]], align 4 +// CHECK3-NEXT: store i32 [[B]], i32* [[B_ADDR]], align 4 +// CHECK3-NEXT: store i32 [[VLA]], i32* [[VLA_ADDR]], align 4 +// CHECK3-NEXT: store i32 [[VLA1]], i32* [[VLA_ADDR2]], align 4 +// CHECK3-NEXT: store i16* [[C]], i16** [[C_ADDR]], align 4 +// CHECK3-NEXT: [[TMP0:%.*]] = load %struct.S1*, %struct.S1** [[THIS_ADDR]], align 4 +// CHECK3-NEXT: [[TMP1:%.*]] = load i32, i32* [[VLA_ADDR]], align 4 +// CHECK3-NEXT: [[TMP2:%.*]] = load i32, i32* [[VLA_ADDR2]], align 4 +// CHECK3-NEXT: [[TMP3:%.*]] = load i16*, i16** [[C_ADDR]], align 4 +// CHECK3-NEXT: 
[[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK3-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK3-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK3-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]] +// CHECK3-NEXT: [[TMP4:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]] +// CHECK3-NEXT: br i1 [[TMP4]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]] +// CHECK3: .worker: +// CHECK3-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2S12r1Ei_l108_worker() #[[ATTR2]] +// CHECK3-NEXT: br label [[DOTEXIT:%.*]] +// CHECK3: .mastercheck: +// CHECK3-NEXT: [[NVPTX_TID3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK3-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK3-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK3-NEXT: [[TMP5:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE5]], 1 +// CHECK3-NEXT: [[TMP6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], 1 +// CHECK3-NEXT: [[TMP7:%.*]] = xor i32 [[TMP5]], -1 +// CHECK3-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP6]], [[TMP7]] +// CHECK3-NEXT: [[TMP8:%.*]] = icmp eq i32 [[NVPTX_TID3]], [[MASTER_TID]] +// CHECK3-NEXT: br i1 [[TMP8]], label [[DOTMASTER:%.*]], label [[DOTEXIT]] +// CHECK3: .master: +// CHECK3-NEXT: [[NVPTX_NUM_THREADS6:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK3-NEXT: [[NVPTX_WARP_SIZE7:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK3-NEXT: [[THREAD_LIMIT8:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS6]], [[NVPTX_WARP_SIZE7]] +// CHECK3-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT8]], i16 1) +// CHECK3-NEXT: call void @__kmpc_data_sharing_init_stack() +// CHECK3-NEXT: [[TMP9:%.*]] = load i32, i32* [[B_ADDR]], align 4 +// CHECK3-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP9]] to double +// CHECK3-NEXT: [[ADD:%.*]] = fadd double [[CONV]], 1.500000e+00 +// CHECK3-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT_S1:%.*]], %struct.S1* [[TMP0]], i32 0, i32 0 +// CHECK3-NEXT: store double [[ADD]], double* [[A]], align 8 +// CHECK3-NEXT: [[A9:%.*]] = getelementptr inbounds [[STRUCT_S1]], %struct.S1* [[TMP0]], i32 0, i32 0 +// CHECK3-NEXT: [[TMP10:%.*]] = load double, double* [[A9]], align 8 +// CHECK3-NEXT: [[INC:%.*]] = fadd double [[TMP10]], 1.000000e+00 +// CHECK3-NEXT: store double [[INC]], double* [[A9]], align 8 +// CHECK3-NEXT: [[CONV10:%.*]] = fptosi double [[INC]] to i16 +// CHECK3-NEXT: [[TMP11:%.*]] = mul nsw i32 1, [[TMP2]] +// CHECK3-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i16, i16* [[TMP3]], i32 [[TMP11]] +// CHECK3-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds i16, i16* [[ARRAYIDX]], i32 1 +// CHECK3-NEXT: store i16 [[CONV10]], i16* [[ARRAYIDX11]], align 2 +// CHECK3-NEXT: [[A12:%.*]] = getelementptr inbounds [[STRUCT_S1]], %struct.S1* [[TMP0]], i32 0, i32 0 +// CHECK3-NEXT: [[TMP12:%.*]] = load double, double* [[A12]], align 8 +// CHECK3-NEXT: [[CONV13:%.*]] = fptosi double [[TMP12]] to i32 +// CHECK3-NEXT: [[A14:%.*]] = getelementptr inbounds [[STRUCT_S1]], %struct.S1* [[TMP0]], i32 0, i32 0 +// CHECK3-NEXT: [[CALL:%.*]] = call i32 @_Z3baziRd(i32 [[CONV13]], double* nonnull align 8 dereferenceable(8) [[A14]]) #[[ATTR7]] +// CHECK3-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]] +// CHECK3: .termination.notifier: +// CHECK3-NEXT: call void @__kmpc_kernel_deinit(i16 1) +// CHECK3-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// 
CHECK3-NEXT: br label [[DOTEXIT]] +// CHECK3: .exit: +// CHECK3-NEXT: ret void +// +// +// CHECK3-LABEL: define {{[^@]+}}@_Z3baziRd +// CHECK3-SAME: (i32 [[F3:%.*]], double* nonnull align 8 dereferenceable(8) [[A:%.*]]) #[[ATTR5]] { +// CHECK3-NEXT: entry: +// CHECK3-NEXT: [[RETVAL:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[F2:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[A_ADDR:%.*]] = alloca double*, align 4 +// CHECK3-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [2 x i8*], align 4 +// CHECK3-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3:[0-9]+]]) +// CHECK3-NEXT: [[TMP1:%.*]] = call i16 @__kmpc_parallel_level(%struct.ident_t* @[[GLOB3]], i32 [[TMP0]]) +// CHECK3-NEXT: [[TMP2:%.*]] = icmp eq i16 [[TMP1]], 0 +// CHECK3-NEXT: [[TMP3:%.*]] = call i8 @__kmpc_is_spmd_exec_mode() #[[ATTR2]] +// CHECK3-NEXT: [[TMP4:%.*]] = icmp ne i8 [[TMP3]], 0 +// CHECK3-NEXT: br i1 [[TMP4]], label [[DOTSPMD:%.*]], label [[DOTNON_SPMD:%.*]] +// CHECK3: .spmd: +// CHECK3-NEXT: br label [[DOTEXIT:%.*]] +// CHECK3: .non-spmd: +// CHECK3-NEXT: [[TMP5:%.*]] = select i1 [[TMP2]], i32 4, i32 128 +// CHECK3-NEXT: [[TMP6:%.*]] = call i8* @__kmpc_data_sharing_coalesced_push_stack(i32 [[TMP5]], i16 0) +// CHECK3-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to %struct._globalized_locals_ty* +// CHECK3-NEXT: br label [[DOTEXIT]] +// CHECK3: .exit: +// CHECK3-NEXT: [[_SELECT_STACK:%.*]] = phi %struct._globalized_locals_ty* [ null, [[DOTSPMD]] ], [ [[TMP7]], [[DOTNON_SPMD]] ] +// CHECK3-NEXT: [[TMP8:%.*]] = bitcast %struct._globalized_locals_ty* [[_SELECT_STACK]] to %struct._globalized_locals_ty.0* +// CHECK3-NEXT: [[F:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[_SELECT_STACK]], i32 0, i32 0 +// CHECK3-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK3-NEXT: [[NVPTX_LANE_ID:%.*]] = and i32 [[NVPTX_TID]], 31 +// CHECK3-NEXT: [[TMP9:%.*]] = getelementptr inbounds [32 x i32], [32 x i32]* [[F]], i32 0, i32 [[NVPTX_LANE_ID]] +// CHECK3-NEXT: [[F1:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0:%.*]], %struct._globalized_locals_ty.0* [[TMP8]], i32 0, i32 0 +// CHECK3-NEXT: [[TMP10:%.*]] = select i1 [[TMP2]], i32* [[F1]], i32* [[TMP9]] +// CHECK3-NEXT: [[TMP11:%.*]] = select i1 [[TMP4]], i32* [[F2]], i32* [[TMP10]] +// CHECK3-NEXT: store i32 [[F3]], i32* [[TMP11]], align 4 +// CHECK3-NEXT: store double* [[A]], double** [[A_ADDR]], align 4 +// CHECK3-NEXT: [[TMP12:%.*]] = load double*, double** [[A_ADDR]], align 4 +// CHECK3-NEXT: [[TMP13:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 +// CHECK3-NEXT: [[TMP14:%.*]] = bitcast i32* [[TMP11]] to i8* +// CHECK3-NEXT: store i8* [[TMP14]], i8** [[TMP13]], align 4 +// CHECK3-NEXT: [[TMP15:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 +// CHECK3-NEXT: [[TMP16:%.*]] = bitcast double* [[TMP12]] to i8* +// CHECK3-NEXT: store i8* [[TMP16]], i8** [[TMP15]], align 4 +// CHECK3-NEXT: [[TMP17:%.*]] = bitcast [2 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK3-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP0]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32*, double*)* @__omp_outlined__1 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__1_wrapper to i8*), i8** [[TMP17]], i32 2) +// CHECK3-NEXT: [[TMP18:%.*]] = load i32, i32* [[TMP11]], align 4 +// CHECK3-NEXT: store i32 [[TMP18]], i32* [[RETVAL]], align 4 +// CHECK3-NEXT: br i1 
[[TMP4]], label [[DOTEXIT5:%.*]], label [[DOTNON_SPMD4:%.*]] +// CHECK3: .non-spmd4: +// CHECK3-NEXT: [[TMP19:%.*]] = bitcast %struct._globalized_locals_ty* [[_SELECT_STACK]] to i8* +// CHECK3-NEXT: call void @__kmpc_data_sharing_pop_stack(i8* [[TMP19]]) +// CHECK3-NEXT: br label [[DOTEXIT5]] +// CHECK3: .exit5: +// CHECK3-NEXT: [[TMP20:%.*]] = load i32, i32* [[RETVAL]], align 4 +// CHECK3-NEXT: ret i32 [[TMP20]] +// +// +// CHECK3-LABEL: define {{[^@]+}}@__omp_outlined__1 +// CHECK3-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[F:%.*]], double* nonnull align 8 dereferenceable(8) [[A:%.*]]) #[[ATTR0]] { +// CHECK3-NEXT: entry: +// CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK3-NEXT: [[F_ADDR:%.*]] = alloca i32*, align 4 +// CHECK3-NEXT: [[A_ADDR:%.*]] = alloca double*, align 4 +// CHECK3-NEXT: [[TMP:%.*]] = alloca double*, align 4 +// CHECK3-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK3-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK3-NEXT: store i32* [[F]], i32** [[F_ADDR]], align 4 +// CHECK3-NEXT: store double* [[A]], double** [[A_ADDR]], align 4 +// CHECK3-NEXT: [[TMP0:%.*]] = load i32*, i32** [[F_ADDR]], align 4 +// CHECK3-NEXT: [[TMP1:%.*]] = load double*, double** [[A_ADDR]], align 4 +// CHECK3-NEXT: store double* [[TMP1]], double** [[TMP]], align 4 +// CHECK3-NEXT: [[TMP2:%.*]] = load double*, double** [[TMP]], align 4 +// CHECK3-NEXT: [[TMP3:%.*]] = load double, double* [[TMP2]], align 8 +// CHECK3-NEXT: [[ADD:%.*]] = fadd double 2.000000e+00, [[TMP3]] +// CHECK3-NEXT: [[CONV:%.*]] = fptosi double [[ADD]] to i32 +// CHECK3-NEXT: store i32 [[CONV]], i32* [[TMP0]], align 4 +// CHECK3-NEXT: ret void +// +// +// CHECK3-LABEL: define {{[^@]+}}@__omp_outlined__1_wrapper +// CHECK3-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR3]] { +// CHECK3-NEXT: entry: +// CHECK3-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 +// CHECK3-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 4 +// CHECK3-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK3-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 +// CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK3-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) +// CHECK3-NEXT: [[TMP2:%.*]] = load i8**, i8*** [[GLOBAL_ARGS]], align 4 +// CHECK3-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8*, i8** [[TMP2]], i32 0 +// CHECK3-NEXT: [[TMP4:%.*]] = bitcast i8** [[TMP3]] to i32** +// CHECK3-NEXT: [[TMP5:%.*]] = load i32*, i32** [[TMP4]], align 4 +// CHECK3-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8*, i8** [[TMP2]], i32 1 +// CHECK3-NEXT: [[TMP7:%.*]] = bitcast i8** [[TMP6]] to double** +// CHECK3-NEXT: [[TMP8:%.*]] = load double*, double** [[TMP7]], align 4 +// CHECK3-NEXT: call void @__omp_outlined__1(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]], i32* [[TMP5]], double* [[TMP8]]) #[[ATTR2]] +// CHECK3-NEXT: ret void +// +// +// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z16unreachable_callv_l142_worker +// CHECK3-SAME: () #[[ATTR3]] { +// CHECK3-NEXT: entry: +// CHECK3-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 4 +// CHECK3-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1 +// CHECK3-NEXT: store i8* null, i8** [[WORK_FN]], align 
4 +// CHECK3-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1 +// CHECK3-NEXT: br label [[DOTAWAIT_WORK:%.*]] +// CHECK3: .await.work: +// CHECK3-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK3-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]]) +// CHECK3-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8 +// CHECK3-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1 +// CHECK3-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 4 +// CHECK3-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null +// CHECK3-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]] +// CHECK3: .select.workers: +// CHECK3-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1 +// CHECK3-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0 +// CHECK3-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]] +// CHECK3: .execute.parallel: +// CHECK3-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]]) +// CHECK3-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)* +// CHECK3-NEXT: call void [[TMP5]](i16 0, i32 [[TMP4]]) +// CHECK3-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]] +// CHECK3: .terminate.parallel: +// CHECK3-NEXT: call void @__kmpc_kernel_end_parallel() +// CHECK3-NEXT: br label [[DOTBARRIER_PARALLEL]] +// CHECK3: .barrier.parallel: +// CHECK3-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK3-NEXT: br label [[DOTAWAIT_WORK]] +// CHECK3: .exit: +// CHECK3-NEXT: ret void +// +// +// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z16unreachable_callv_l142 +// CHECK3-SAME: () #[[ATTR0]] { +// CHECK3-NEXT: entry: +// CHECK3-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK3-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK3-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK3-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]] +// CHECK3-NEXT: [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]] +// CHECK3-NEXT: br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]] +// CHECK3: .worker: +// CHECK3-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z16unreachable_callv_l142_worker() #[[ATTR2]] +// CHECK3-NEXT: br label [[DOTEXIT:%.*]] +// CHECK3: .mastercheck: +// CHECK3-NEXT: [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK3-NEXT: [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK3-NEXT: [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK3-NEXT: [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1 +// CHECK3-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1 +// CHECK3-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], -1 +// CHECK3-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP2]], [[TMP3]] +// CHECK3-NEXT: [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]] +// CHECK3-NEXT: br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]] +// CHECK3: .master: +// CHECK3-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK3-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK3-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]] +// CHECK3-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1) +// CHECK3-NEXT: call void 
@__kmpc_data_sharing_init_stack() +// CHECK3-NEXT: call void @_Z6asserti(i32 0) #[[ATTR8:[0-9]+]] +// CHECK3-NEXT: unreachable +// CHECK3: 5: +// CHECK3-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]] +// CHECK3: .termination.notifier: +// CHECK3-NEXT: call void @__kmpc_kernel_deinit(i16 1) +// CHECK3-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK3-NEXT: br label [[DOTEXIT]] +// CHECK3: .exit: +// CHECK3-NEXT: ret void +// +// +// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l74_worker +// CHECK3-SAME: () #[[ATTR3]] { +// CHECK3-NEXT: entry: +// CHECK3-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 4 +// CHECK3-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1 +// CHECK3-NEXT: store i8* null, i8** [[WORK_FN]], align 4 +// CHECK3-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1 +// CHECK3-NEXT: br label [[DOTAWAIT_WORK:%.*]] +// CHECK3: .await.work: +// CHECK3-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK3-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]]) +// CHECK3-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8 +// CHECK3-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1 +// CHECK3-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 4 +// CHECK3-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null +// CHECK3-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]] +// CHECK3: .select.workers: +// CHECK3-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1 +// CHECK3-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0 +// CHECK3-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]] +// CHECK3: .execute.parallel: +// CHECK3-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]]) +// CHECK3-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)* +// CHECK3-NEXT: call void [[TMP5]](i16 0, i32 [[TMP4]]) +// CHECK3-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]] +// CHECK3: .terminate.parallel: +// CHECK3-NEXT: call void @__kmpc_kernel_end_parallel() +// CHECK3-NEXT: br label [[DOTBARRIER_PARALLEL]] +// CHECK3: .barrier.parallel: +// CHECK3-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK3-NEXT: br label [[DOTAWAIT_WORK]] +// CHECK3: .exit: +// CHECK3-NEXT: ret void +// +// +// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l74 +// CHECK3-SAME: (i32 [[A:%.*]], i32 [[AA:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { +// CHECK3-NEXT: entry: +// CHECK3-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[AA_ADDR:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 +// CHECK3-NEXT: store i32 [[A]], i32* [[A_ADDR]], align 4 +// CHECK3-NEXT: store i32 [[AA]], i32* [[AA_ADDR]], align 4 +// CHECK3-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 +// CHECK3-NEXT: [[CONV:%.*]] = bitcast i32* [[AA_ADDR]] to i16* +// CHECK3-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 +// CHECK3-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK3-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK3-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK3-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]] +// CHECK3-NEXT: [[TMP1:%.*]] = icmp ult i32 
[[NVPTX_TID]], [[THREAD_LIMIT]] +// CHECK3-NEXT: br i1 [[TMP1]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]] +// CHECK3: .worker: +// CHECK3-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l74_worker() #[[ATTR2]] +// CHECK3-NEXT: br label [[DOTEXIT:%.*]] +// CHECK3: .mastercheck: +// CHECK3-NEXT: [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK3-NEXT: [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK3-NEXT: [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK3-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1 +// CHECK3-NEXT: [[TMP3:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1 +// CHECK3-NEXT: [[TMP4:%.*]] = xor i32 [[TMP2]], -1 +// CHECK3-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP3]], [[TMP4]] +// CHECK3-NEXT: [[TMP5:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]] +// CHECK3-NEXT: br i1 [[TMP5]], label [[DOTMASTER:%.*]], label [[DOTEXIT]] +// CHECK3: .master: +// CHECK3-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK3-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK3-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]] +// CHECK3-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1) +// CHECK3-NEXT: call void @__kmpc_data_sharing_init_stack() +// CHECK3-NEXT: [[TMP6:%.*]] = load i32, i32* [[A_ADDR]], align 4 +// CHECK3-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP6]], 1 +// CHECK3-NEXT: store i32 [[ADD]], i32* [[A_ADDR]], align 4 +// CHECK3-NEXT: [[TMP7:%.*]] = load i16, i16* [[CONV]], align 4 +// CHECK3-NEXT: [[CONV7:%.*]] = sext i16 [[TMP7]] to i32 +// CHECK3-NEXT: [[ADD8:%.*]] = add nsw i32 [[CONV7]], 1 +// CHECK3-NEXT: [[CONV9:%.*]] = trunc i32 [[ADD8]] to i16 +// CHECK3-NEXT: store i16 [[CONV9]], i16* [[CONV]], align 4 +// CHECK3-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP0]], i32 0, i32 2 +// CHECK3-NEXT: [[TMP8:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +// CHECK3-NEXT: [[ADD10:%.*]] = add nsw i32 [[TMP8]], 1 +// CHECK3-NEXT: store i32 [[ADD10]], i32* [[ARRAYIDX]], align 4 +// CHECK3-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]] +// CHECK3: .termination.notifier: +// CHECK3-NEXT: call void @__kmpc_kernel_deinit(i16 1) +// CHECK3-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK3-NEXT: br label [[DOTEXIT]] +// CHECK3: .exit: +// CHECK3-NEXT: ret void +// diff --git a/clang/test/OpenMP/nvptx_target_parallel_codegen.cpp b/clang/test/OpenMP/nvptx_target_parallel_codegen.cpp --- a/clang/test/OpenMP/nvptx_target_parallel_codegen.cpp +++ b/clang/test/OpenMP/nvptx_target_parallel_codegen.cpp @@ -1,24 +1,21 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature --include-generated-funcs --replace-function-regex "__omp_offloading_[0-9a-z]+_[0-9a-z]+" // Test target codegen - host bc file has to be created first. 
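The regenerated checks in these tests all encode one calling convention: codegen packs the address of every captured variable into a local `captured_vars_addrs` array of `i8*` slots, hands that array to `__kmpc_parallel_51` along with the outlined function and its wrapper, and the wrapper later unpacks the slots positionally before forwarding to the typed outlined body. Below is a minimal host-side C++ sketch of that pack/unpack round trip; the runtime itself is elided, the wrapper takes the array directly instead of fetching it via `__kmpc_get_shared_variables`, and every name in it is hypothetical.

#include <cstdint>
#include <cstdio>

// Shaped like @__omp_outlined__1 above: the two thread-id slots come first,
// then one typed parameter per captured variable.
static void omp_outlined(int32_t * /*global_tid*/, int32_t * /*bound_tid*/,
                         int *a, short *aa, int (*b)[10]) {
  *a += 1;
  *aa = static_cast<short>(*aa + 1);
  (*b)[2] += 1;
}

// Shaped like @__omp_outlined__1_wrapper above: it receives no typed
// arguments and unpacks the shared-argument array positionally. In the
// generated IR the array comes from __kmpc_get_shared_variables(); here it
// is passed in directly to keep the sketch self-contained.
static void omp_outlined_wrapper(void **captured_vars_addrs, int32_t gtid) {
  int32_t zero = 0; // the bound-thread-id slot is a zeroed temporary
  omp_outlined(&gtid, &zero,
               static_cast<int *>(captured_vars_addrs[0]),
               static_cast<short *>(captured_vars_addrs[1]),
               static_cast<int (*)[10]>(captured_vars_addrs[2]));
}

int main() {
  int a = 1;
  short aa = 1;
  int b[10] = {};
  // The packing step the CAPTURED_VARS_ADDRS checks verify: one pointer
  // slot per capture, in declaration order.
  void *captured_vars_addrs[3] = {&a, &aa, &b};
  omp_outlined_wrapper(captured_vars_addrs, /*gtid=*/0);
  std::printf("a=%d aa=%d b[2]=%d\n", a, aa, b[2]); // prints: a=2 aa=2 b[2]=1
  return 0;
}

In the real IR, `__kmpc_parallel_51` receives both the outlined function and the wrapper (or a null wrapper in SPMD mode, as in the CHECK1 lines below) and chooses how to dispatch; the sketch shows only the argument-marshalling data flow.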
// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc -// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64 +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix CHECK1 // RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -x c++ -triple i386-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm-bc %s -o %t-x86-host.bc -// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 -// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix CHECK2 +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix CHECK3 // RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc -// RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64 +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix CHECK4 // RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -x c++ -triple i386-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm-bc %s -o %t-x86-host.bc -// RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 -// RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown 
-fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix CHECK5 +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix CHECK6 // expected-no-diagnostics #ifndef HEADER #define HEADER -// Check that the execution mode of all 2 target regions on the gpu is set to SPMD Mode. -// CHECK-DAG: {{@__omp_offloading_.+l33}}_exec_mode = weak constant i8 0 -// CHECK-DAG: {{@__omp_offloading_.+l38}}_exec_mode = weak constant i8 0 - template <typename tx> tx ftemplate(int n) { tx a = 0; @@ -53,84 +50,681 @@ return a; }
-// -// CHECK: [[DONE]] -// CHECK: call void @__kmpc_spmd_kernel_deinit_v2(i16 1) -// CHECK: br label {{%?}}[[EXIT:.+]] -// -// CHECK: [[EXIT]] -// CHECK: ret void -// CHECK: } - -// CHECK: define internal void [[OP2]](i32* noalias %.global_tid., i32* noalias %.bound_tid., i32* {{[^%]*}}[[ARG1:%.+]], i16* {{[^%]*}}[[ARG2:%.+]], [10 x i32]* {{[^%]*}}[[ARG3:%.+]]) -// CHECK: = alloca i32*, align -// CHECK: = alloca i32*, align -// CHECK: [[A_ADDR:%.+]] = alloca i32*, align -// CHECK: [[AA_ADDR:%.+]] = alloca i16*, align -// CHECK: [[B_ADDR:%.+]] = alloca [10 x i32]*, align -// CHECK: store i32* [[ARG1]], i32** [[A_ADDR]], align -// CHECK: store i16* [[ARG2]], i16** [[AA_ADDR]], align -// CHECK: store [10 x i32]* [[ARG3]], [10 x i32]** [[B_ADDR]], align -// CHECK: [[A:%.+]] = load i32*, i32** [[A_ADDR]], align -// CHECK: [[AA:%.+]] = load i16*, i16** [[AA_ADDR]], align -// CHECK: [[B:%.+]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align -// CHECK: store i32 {{%.+}}, i32* [[A]], align -// CHECK: store i16 {{%.+}}, i16* [[AA]], align -// CHECK: [[ELT:%.+]] = getelementptr inbounds [10 x i32], [10 x i32]* [[B]], -// CHECK: store i32 {{%.+}}, i32* [[ELT]], align -// CHECK: ret void -// CHECK: } #endif +// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l30 +// CHECK1-SAME: (i16* nonnull align 2 dereferenceable(2) [[AA:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[AA_ADDR:%.*]] = alloca i16*, align 8 +// CHECK1-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [1 x i8*], align 8 +// CHECK1-NEXT: store i16* [[AA]], i16** [[AA_ADDR]], align 8 +// CHECK1-NEXT: [[TMP0:%.*]] = load i16*, i16** [[AA_ADDR]], align 8 +// CHECK1-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK1-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 1) +// CHECK1-NEXT: call void @__kmpc_data_sharing_init_stack_spmd() +// CHECK1-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK1: .execute: +// CHECK1-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]]) +// CHECK1-NEXT: [[TMP2:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0 +// CHECK1-NEXT: [[TMP3:%.*]] = bitcast i16* [[TMP0]] to i8* +// CHECK1-NEXT: store i8* [[TMP3]], i8** [[TMP2]], align 8 +// CHECK1-NEXT: [[TMP4:%.*]] = bitcast [1 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK1-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i16*)* @__omp_outlined__ to i8*), i8* null, i8** [[TMP4]], i64 1) +// CHECK1-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK1: .omp.deinit: +// CHECK1-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 1) +// CHECK1-NEXT: br label [[DOTEXIT:%.*]] +// CHECK1: .exit: +// CHECK1-NEXT: ret void +// +// +// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__ +// CHECK1-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i16* nonnull align 2 dereferenceable(2) [[AA:%.*]]) #[[ATTR0]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK1-NEXT: [[AA_ADDR:%.*]] = alloca i16*, align 8 +// CHECK1-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK1-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +// CHECK1-NEXT: store i16* [[AA]], i16** [[AA_ADDR]], align 8 +// CHECK1-NEXT: [[TMP0:%.*]] = 
load i16*, i16** [[AA_ADDR]], align 8 +// CHECK1-NEXT: [[TMP1:%.*]] = load i16, i16* [[TMP0]], align 2 +// CHECK1-NEXT: [[CONV:%.*]] = sext i16 [[TMP1]] to i32 +// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 [[CONV]], 1 +// CHECK1-NEXT: [[CONV1:%.*]] = trunc i32 [[ADD]] to i16 +// CHECK1-NEXT: store i16 [[CONV1]], i16* [[TMP0]], align 2 +// CHECK1-NEXT: ret void +// +// +// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l35 +// CHECK1-SAME: (i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i16* nonnull align 2 dereferenceable(2) [[AA:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 8 +// CHECK1-NEXT: [[AA_ADDR:%.*]] = alloca i16*, align 8 +// CHECK1-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 8 +// CHECK1-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [3 x i8*], align 8 +// CHECK1-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 8 +// CHECK1-NEXT: store i16* [[AA]], i16** [[AA_ADDR]], align 8 +// CHECK1-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 8 +// CHECK1-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 8 +// CHECK1-NEXT: [[TMP1:%.*]] = load i16*, i16** [[AA_ADDR]], align 8 +// CHECK1-NEXT: [[TMP2:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 8 +// CHECK1-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK1-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 1) +// CHECK1-NEXT: call void @__kmpc_data_sharing_init_stack_spmd() +// CHECK1-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK1: .execute: +// CHECK1-NEXT: [[TMP3:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) +// CHECK1-NEXT: [[TMP4:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0 +// CHECK1-NEXT: [[TMP5:%.*]] = bitcast i32* [[TMP0]] to i8* +// CHECK1-NEXT: store i8* [[TMP5]], i8** [[TMP4]], align 8 +// CHECK1-NEXT: [[TMP6:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1 +// CHECK1-NEXT: [[TMP7:%.*]] = bitcast i16* [[TMP1]] to i8* +// CHECK1-NEXT: store i8* [[TMP7]], i8** [[TMP6]], align 8 +// CHECK1-NEXT: [[TMP8:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 2 +// CHECK1-NEXT: [[TMP9:%.*]] = bitcast [10 x i32]* [[TMP2]] to i8* +// CHECK1-NEXT: store i8* [[TMP9]], i8** [[TMP8]], align 8 +// CHECK1-NEXT: [[TMP10:%.*]] = bitcast [3 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK1-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP3]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32*, i16*, [10 x i32]*)* @__omp_outlined__1 to i8*), i8* null, i8** [[TMP10]], i64 3) +// CHECK1-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK1: .omp.deinit: +// CHECK1-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 1) +// CHECK1-NEXT: br label [[DOTEXIT:%.*]] +// CHECK1: .exit: +// CHECK1-NEXT: ret void +// +// +// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__1 +// CHECK1-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i16* nonnull align 2 dereferenceable(2) [[AA:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK1-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 8 +// 
CHECK1-NEXT: [[AA_ADDR:%.*]] = alloca i16*, align 8 +// CHECK1-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 8 +// CHECK1-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK1-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +// CHECK1-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 8 +// CHECK1-NEXT: store i16* [[AA]], i16** [[AA_ADDR]], align 8 +// CHECK1-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 8 +// CHECK1-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 8 +// CHECK1-NEXT: [[TMP1:%.*]] = load i16*, i16** [[AA_ADDR]], align 8 +// CHECK1-NEXT: [[TMP2:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 8 +// CHECK1-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP0]], align 4 +// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP3]], 1 +// CHECK1-NEXT: store i32 [[ADD]], i32* [[TMP0]], align 4 +// CHECK1-NEXT: [[TMP4:%.*]] = load i16, i16* [[TMP1]], align 2 +// CHECK1-NEXT: [[CONV:%.*]] = sext i16 [[TMP4]] to i32 +// CHECK1-NEXT: [[ADD1:%.*]] = add nsw i32 [[CONV]], 1 +// CHECK1-NEXT: [[CONV2:%.*]] = trunc i32 [[ADD1]] to i16 +// CHECK1-NEXT: store i16 [[CONV2]], i16* [[TMP1]], align 2 +// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP2]], i64 0, i64 2 +// CHECK1-NEXT: [[TMP5:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +// CHECK1-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP5]], 1 +// CHECK1-NEXT: store i32 [[ADD3]], i32* [[ARRAYIDX]], align 4 +// CHECK1-NEXT: ret void +// +// +// CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l30 +// CHECK2-SAME: (i16* nonnull align 2 dereferenceable(2) [[AA:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[AA_ADDR:%.*]] = alloca i16*, align 4 +// CHECK2-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [1 x i8*], align 4 +// CHECK2-NEXT: store i16* [[AA]], i16** [[AA_ADDR]], align 4 +// CHECK2-NEXT: [[TMP0:%.*]] = load i16*, i16** [[AA_ADDR]], align 4 +// CHECK2-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK2-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 1) +// CHECK2-NEXT: call void @__kmpc_data_sharing_init_stack_spmd() +// CHECK2-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK2: .execute: +// CHECK2-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]]) +// CHECK2-NEXT: [[TMP2:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 +// CHECK2-NEXT: [[TMP3:%.*]] = bitcast i16* [[TMP0]] to i8* +// CHECK2-NEXT: store i8* [[TMP3]], i8** [[TMP2]], align 4 +// CHECK2-NEXT: [[TMP4:%.*]] = bitcast [1 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK2-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i16*)* @__omp_outlined__ to i8*), i8* null, i8** [[TMP4]], i32 1) +// CHECK2-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK2: .omp.deinit: +// CHECK2-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 1) +// CHECK2-NEXT: br label [[DOTEXIT:%.*]] +// CHECK2: .exit: +// CHECK2-NEXT: ret void +// +// +// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__ +// CHECK2-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i16* nonnull align 2 dereferenceable(2) [[AA:%.*]]) #[[ATTR0]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK2-NEXT: 
[[AA_ADDR:%.*]] = alloca i16*, align 4 +// CHECK2-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK2-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK2-NEXT: store i16* [[AA]], i16** [[AA_ADDR]], align 4 +// CHECK2-NEXT: [[TMP0:%.*]] = load i16*, i16** [[AA_ADDR]], align 4 +// CHECK2-NEXT: [[TMP1:%.*]] = load i16, i16* [[TMP0]], align 2 +// CHECK2-NEXT: [[CONV:%.*]] = sext i16 [[TMP1]] to i32 +// CHECK2-NEXT: [[ADD:%.*]] = add nsw i32 [[CONV]], 1 +// CHECK2-NEXT: [[CONV1:%.*]] = trunc i32 [[ADD]] to i16 +// CHECK2-NEXT: store i16 [[CONV1]], i16* [[TMP0]], align 2 +// CHECK2-NEXT: ret void +// +// +// CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l35 +// CHECK2-SAME: (i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i16* nonnull align 2 dereferenceable(2) [[AA:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 4 +// CHECK2-NEXT: [[AA_ADDR:%.*]] = alloca i16*, align 4 +// CHECK2-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 +// CHECK2-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [3 x i8*], align 4 +// CHECK2-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 4 +// CHECK2-NEXT: store i16* [[AA]], i16** [[AA_ADDR]], align 4 +// CHECK2-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 +// CHECK2-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 4 +// CHECK2-NEXT: [[TMP1:%.*]] = load i16*, i16** [[AA_ADDR]], align 4 +// CHECK2-NEXT: [[TMP2:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 +// CHECK2-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK2-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 1) +// CHECK2-NEXT: call void @__kmpc_data_sharing_init_stack_spmd() +// CHECK2-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK2: .execute: +// CHECK2-NEXT: [[TMP3:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) +// CHECK2-NEXT: [[TMP4:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 +// CHECK2-NEXT: [[TMP5:%.*]] = bitcast i32* [[TMP0]] to i8* +// CHECK2-NEXT: store i8* [[TMP5]], i8** [[TMP4]], align 4 +// CHECK2-NEXT: [[TMP6:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 +// CHECK2-NEXT: [[TMP7:%.*]] = bitcast i16* [[TMP1]] to i8* +// CHECK2-NEXT: store i8* [[TMP7]], i8** [[TMP6]], align 4 +// CHECK2-NEXT: [[TMP8:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 +// CHECK2-NEXT: [[TMP9:%.*]] = bitcast [10 x i32]* [[TMP2]] to i8* +// CHECK2-NEXT: store i8* [[TMP9]], i8** [[TMP8]], align 4 +// CHECK2-NEXT: [[TMP10:%.*]] = bitcast [3 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK2-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP3]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32*, i16*, [10 x i32]*)* @__omp_outlined__1 to i8*), i8* null, i8** [[TMP10]], i32 3) +// CHECK2-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK2: .omp.deinit: +// CHECK2-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 1) +// CHECK2-NEXT: br label [[DOTEXIT:%.*]] +// CHECK2: .exit: +// CHECK2-NEXT: ret void +// +// +// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__1 +// CHECK2-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i16* nonnull align 2 
dereferenceable(2) [[AA:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK2-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 4 +// CHECK2-NEXT: [[AA_ADDR:%.*]] = alloca i16*, align 4 +// CHECK2-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 +// CHECK2-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK2-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK2-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 4 +// CHECK2-NEXT: store i16* [[AA]], i16** [[AA_ADDR]], align 4 +// CHECK2-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 +// CHECK2-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 4 +// CHECK2-NEXT: [[TMP1:%.*]] = load i16*, i16** [[AA_ADDR]], align 4 +// CHECK2-NEXT: [[TMP2:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 +// CHECK2-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP0]], align 4 +// CHECK2-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP3]], 1 +// CHECK2-NEXT: store i32 [[ADD]], i32* [[TMP0]], align 4 +// CHECK2-NEXT: [[TMP4:%.*]] = load i16, i16* [[TMP1]], align 2 +// CHECK2-NEXT: [[CONV:%.*]] = sext i16 [[TMP4]] to i32 +// CHECK2-NEXT: [[ADD1:%.*]] = add nsw i32 [[CONV]], 1 +// CHECK2-NEXT: [[CONV2:%.*]] = trunc i32 [[ADD1]] to i16 +// CHECK2-NEXT: store i16 [[CONV2]], i16* [[TMP1]], align 2 +// CHECK2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP2]], i32 0, i32 2 +// CHECK2-NEXT: [[TMP5:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +// CHECK2-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP5]], 1 +// CHECK2-NEXT: store i32 [[ADD3]], i32* [[ARRAYIDX]], align 4 +// CHECK2-NEXT: ret void +// +// +// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l30 +// CHECK3-SAME: (i16* nonnull align 2 dereferenceable(2) [[AA:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK3-NEXT: entry: +// CHECK3-NEXT: [[AA_ADDR:%.*]] = alloca i16*, align 4 +// CHECK3-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [1 x i8*], align 4 +// CHECK3-NEXT: store i16* [[AA]], i16** [[AA_ADDR]], align 4 +// CHECK3-NEXT: [[TMP0:%.*]] = load i16*, i16** [[AA_ADDR]], align 4 +// CHECK3-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK3-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 1) +// CHECK3-NEXT: call void @__kmpc_data_sharing_init_stack_spmd() +// CHECK3-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK3: .execute: +// CHECK3-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]]) +// CHECK3-NEXT: [[TMP2:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 +// CHECK3-NEXT: [[TMP3:%.*]] = bitcast i16* [[TMP0]] to i8* +// CHECK3-NEXT: store i8* [[TMP3]], i8** [[TMP2]], align 4 +// CHECK3-NEXT: [[TMP4:%.*]] = bitcast [1 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK3-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i16*)* @__omp_outlined__ to i8*), i8* null, i8** [[TMP4]], i32 1) +// CHECK3-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK3: .omp.deinit: +// CHECK3-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 1) +// CHECK3-NEXT: br label [[DOTEXIT:%.*]] +// CHECK3: .exit: +// CHECK3-NEXT: ret void +// +// +// CHECK3-LABEL: define {{[^@]+}}@__omp_outlined__ +// CHECK3-SAME: (i32* 
noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i16* nonnull align 2 dereferenceable(2) [[AA:%.*]]) #[[ATTR0]] { +// CHECK3-NEXT: entry: +// CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK3-NEXT: [[AA_ADDR:%.*]] = alloca i16*, align 4 +// CHECK3-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK3-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK3-NEXT: store i16* [[AA]], i16** [[AA_ADDR]], align 4 +// CHECK3-NEXT: [[TMP0:%.*]] = load i16*, i16** [[AA_ADDR]], align 4 +// CHECK3-NEXT: [[TMP1:%.*]] = load i16, i16* [[TMP0]], align 2 +// CHECK3-NEXT: [[CONV:%.*]] = sext i16 [[TMP1]] to i32 +// CHECK3-NEXT: [[ADD:%.*]] = add nsw i32 [[CONV]], 1 +// CHECK3-NEXT: [[CONV1:%.*]] = trunc i32 [[ADD]] to i16 +// CHECK3-NEXT: store i16 [[CONV1]], i16* [[TMP0]], align 2 +// CHECK3-NEXT: ret void +// +// +// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l35 +// CHECK3-SAME: (i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i16* nonnull align 2 dereferenceable(2) [[AA:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { +// CHECK3-NEXT: entry: +// CHECK3-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 4 +// CHECK3-NEXT: [[AA_ADDR:%.*]] = alloca i16*, align 4 +// CHECK3-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 +// CHECK3-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [3 x i8*], align 4 +// CHECK3-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 4 +// CHECK3-NEXT: store i16* [[AA]], i16** [[AA_ADDR]], align 4 +// CHECK3-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 +// CHECK3-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 4 +// CHECK3-NEXT: [[TMP1:%.*]] = load i16*, i16** [[AA_ADDR]], align 4 +// CHECK3-NEXT: [[TMP2:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 +// CHECK3-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK3-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 1) +// CHECK3-NEXT: call void @__kmpc_data_sharing_init_stack_spmd() +// CHECK3-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK3: .execute: +// CHECK3-NEXT: [[TMP3:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) +// CHECK3-NEXT: [[TMP4:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 +// CHECK3-NEXT: [[TMP5:%.*]] = bitcast i32* [[TMP0]] to i8* +// CHECK3-NEXT: store i8* [[TMP5]], i8** [[TMP4]], align 4 +// CHECK3-NEXT: [[TMP6:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 +// CHECK3-NEXT: [[TMP7:%.*]] = bitcast i16* [[TMP1]] to i8* +// CHECK3-NEXT: store i8* [[TMP7]], i8** [[TMP6]], align 4 +// CHECK3-NEXT: [[TMP8:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 +// CHECK3-NEXT: [[TMP9:%.*]] = bitcast [10 x i32]* [[TMP2]] to i8* +// CHECK3-NEXT: store i8* [[TMP9]], i8** [[TMP8]], align 4 +// CHECK3-NEXT: [[TMP10:%.*]] = bitcast [3 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK3-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP3]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32*, i16*, [10 x i32]*)* @__omp_outlined__1 to i8*), i8* null, i8** [[TMP10]], i32 3) +// CHECK3-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK3: .omp.deinit: +// CHECK3-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 1) +// CHECK3-NEXT: 
br label [[DOTEXIT:%.*]] +// CHECK3: .exit: +// CHECK3-NEXT: ret void +// +// +// CHECK3-LABEL: define {{[^@]+}}@__omp_outlined__1 +// CHECK3-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i16* nonnull align 2 dereferenceable(2) [[AA:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { +// CHECK3-NEXT: entry: +// CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK3-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 4 +// CHECK3-NEXT: [[AA_ADDR:%.*]] = alloca i16*, align 4 +// CHECK3-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 +// CHECK3-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK3-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK3-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 4 +// CHECK3-NEXT: store i16* [[AA]], i16** [[AA_ADDR]], align 4 +// CHECK3-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 +// CHECK3-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 4 +// CHECK3-NEXT: [[TMP1:%.*]] = load i16*, i16** [[AA_ADDR]], align 4 +// CHECK3-NEXT: [[TMP2:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 +// CHECK3-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP0]], align 4 +// CHECK3-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP3]], 1 +// CHECK3-NEXT: store i32 [[ADD]], i32* [[TMP0]], align 4 +// CHECK3-NEXT: [[TMP4:%.*]] = load i16, i16* [[TMP1]], align 2 +// CHECK3-NEXT: [[CONV:%.*]] = sext i16 [[TMP4]] to i32 +// CHECK3-NEXT: [[ADD1:%.*]] = add nsw i32 [[CONV]], 1 +// CHECK3-NEXT: [[CONV2:%.*]] = trunc i32 [[ADD1]] to i16 +// CHECK3-NEXT: store i16 [[CONV2]], i16* [[TMP1]], align 2 +// CHECK3-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP2]], i32 0, i32 2 +// CHECK3-NEXT: [[TMP5:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +// CHECK3-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP5]], 1 +// CHECK3-NEXT: store i32 [[ADD3]], i32* [[ARRAYIDX]], align 4 +// CHECK3-NEXT: ret void +// +// +// CHECK4-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l30 +// CHECK4-SAME: (i16* nonnull align 2 dereferenceable(2) [[AA:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK4-NEXT: entry: +// CHECK4-NEXT: [[AA_ADDR:%.*]] = alloca i16*, align 8 +// CHECK4-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [1 x i8*], align 8 +// CHECK4-NEXT: store i16* [[AA]], i16** [[AA_ADDR]], align 8 +// CHECK4-NEXT: [[TMP0:%.*]] = load i16*, i16** [[AA_ADDR]], align 8 +// CHECK4-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK4-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 1) +// CHECK4-NEXT: call void @__kmpc_data_sharing_init_stack_spmd() +// CHECK4-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK4: .execute: +// CHECK4-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]]) +// CHECK4-NEXT: [[TMP2:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0 +// CHECK4-NEXT: [[TMP3:%.*]] = bitcast i16* [[TMP0]] to i8* +// CHECK4-NEXT: store i8* [[TMP3]], i8** [[TMP2]], align 8 +// CHECK4-NEXT: [[TMP4:%.*]] = bitcast [1 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK4-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i16*)* @__omp_outlined__ to i8*), i8* null, i8** [[TMP4]], i64 1) +// 
CHECK4-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK4: .omp.deinit: +// CHECK4-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 1) +// CHECK4-NEXT: br label [[DOTEXIT:%.*]] +// CHECK4: .exit: +// CHECK4-NEXT: ret void +// +// +// CHECK4-LABEL: define {{[^@]+}}@__omp_outlined__ +// CHECK4-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i16* nonnull align 2 dereferenceable(2) [[AA:%.*]]) #[[ATTR0]] { +// CHECK4-NEXT: entry: +// CHECK4-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK4-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK4-NEXT: [[AA_ADDR:%.*]] = alloca i16*, align 8 +// CHECK4-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK4-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +// CHECK4-NEXT: store i16* [[AA]], i16** [[AA_ADDR]], align 8 +// CHECK4-NEXT: [[TMP0:%.*]] = load i16*, i16** [[AA_ADDR]], align 8 +// CHECK4-NEXT: [[TMP1:%.*]] = load i16, i16* [[TMP0]], align 2 +// CHECK4-NEXT: [[CONV:%.*]] = sext i16 [[TMP1]] to i32 +// CHECK4-NEXT: [[ADD:%.*]] = add nsw i32 [[CONV]], 1 +// CHECK4-NEXT: [[CONV1:%.*]] = trunc i32 [[ADD]] to i16 +// CHECK4-NEXT: store i16 [[CONV1]], i16* [[TMP0]], align 2 +// CHECK4-NEXT: ret void +// +// +// CHECK4-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l35 +// CHECK4-SAME: (i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i16* nonnull align 2 dereferenceable(2) [[AA:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { +// CHECK4-NEXT: entry: +// CHECK4-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 8 +// CHECK4-NEXT: [[AA_ADDR:%.*]] = alloca i16*, align 8 +// CHECK4-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 8 +// CHECK4-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [3 x i8*], align 8 +// CHECK4-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 8 +// CHECK4-NEXT: store i16* [[AA]], i16** [[AA_ADDR]], align 8 +// CHECK4-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 8 +// CHECK4-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 8 +// CHECK4-NEXT: [[TMP1:%.*]] = load i16*, i16** [[AA_ADDR]], align 8 +// CHECK4-NEXT: [[TMP2:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 8 +// CHECK4-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK4-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 1) +// CHECK4-NEXT: call void @__kmpc_data_sharing_init_stack_spmd() +// CHECK4-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK4: .execute: +// CHECK4-NEXT: [[TMP3:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) +// CHECK4-NEXT: [[TMP4:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0 +// CHECK4-NEXT: [[TMP5:%.*]] = bitcast i32* [[TMP0]] to i8* +// CHECK4-NEXT: store i8* [[TMP5]], i8** [[TMP4]], align 8 +// CHECK4-NEXT: [[TMP6:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1 +// CHECK4-NEXT: [[TMP7:%.*]] = bitcast i16* [[TMP1]] to i8* +// CHECK4-NEXT: store i8* [[TMP7]], i8** [[TMP6]], align 8 +// CHECK4-NEXT: [[TMP8:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 2 +// CHECK4-NEXT: [[TMP9:%.*]] = bitcast [10 x i32]* [[TMP2]] to i8* +// CHECK4-NEXT: store i8* [[TMP9]], i8** [[TMP8]], align 8 +// CHECK4-NEXT: [[TMP10:%.*]] = bitcast [3 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK4-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 
[[TMP3]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32*, i16*, [10 x i32]*)* @__omp_outlined__1 to i8*), i8* null, i8** [[TMP10]], i64 3) +// CHECK4-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK4: .omp.deinit: +// CHECK4-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 1) +// CHECK4-NEXT: br label [[DOTEXIT:%.*]] +// CHECK4: .exit: +// CHECK4-NEXT: ret void +// +// +// CHECK4-LABEL: define {{[^@]+}}@__omp_outlined__1 +// CHECK4-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i16* nonnull align 2 dereferenceable(2) [[AA:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { +// CHECK4-NEXT: entry: +// CHECK4-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK4-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK4-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 8 +// CHECK4-NEXT: [[AA_ADDR:%.*]] = alloca i16*, align 8 +// CHECK4-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 8 +// CHECK4-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK4-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +// CHECK4-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 8 +// CHECK4-NEXT: store i16* [[AA]], i16** [[AA_ADDR]], align 8 +// CHECK4-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 8 +// CHECK4-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 8 +// CHECK4-NEXT: [[TMP1:%.*]] = load i16*, i16** [[AA_ADDR]], align 8 +// CHECK4-NEXT: [[TMP2:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 8 +// CHECK4-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP0]], align 4 +// CHECK4-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP3]], 1 +// CHECK4-NEXT: store i32 [[ADD]], i32* [[TMP0]], align 4 +// CHECK4-NEXT: [[TMP4:%.*]] = load i16, i16* [[TMP1]], align 2 +// CHECK4-NEXT: [[CONV:%.*]] = sext i16 [[TMP4]] to i32 +// CHECK4-NEXT: [[ADD1:%.*]] = add nsw i32 [[CONV]], 1 +// CHECK4-NEXT: [[CONV2:%.*]] = trunc i32 [[ADD1]] to i16 +// CHECK4-NEXT: store i16 [[CONV2]], i16* [[TMP1]], align 2 +// CHECK4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP2]], i64 0, i64 2 +// CHECK4-NEXT: [[TMP5:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +// CHECK4-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP5]], 1 +// CHECK4-NEXT: store i32 [[ADD3]], i32* [[ARRAYIDX]], align 4 +// CHECK4-NEXT: ret void +// +// +// CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l30 +// CHECK5-SAME: (i16* nonnull align 2 dereferenceable(2) [[AA:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK5-NEXT: entry: +// CHECK5-NEXT: [[AA_ADDR:%.*]] = alloca i16*, align 4 +// CHECK5-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [1 x i8*], align 4 +// CHECK5-NEXT: store i16* [[AA]], i16** [[AA_ADDR]], align 4 +// CHECK5-NEXT: [[TMP0:%.*]] = load i16*, i16** [[AA_ADDR]], align 4 +// CHECK5-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK5-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 1) +// CHECK5-NEXT: call void @__kmpc_data_sharing_init_stack_spmd() +// CHECK5-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK5: .execute: +// CHECK5-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]]) +// CHECK5-NEXT: [[TMP2:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 +// CHECK5-NEXT: [[TMP3:%.*]] = bitcast i16* [[TMP0]] to i8* +// CHECK5-NEXT: store i8* [[TMP3]], i8** [[TMP2]], align 4 
+// CHECK5-NEXT: [[TMP4:%.*]] = bitcast [1 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK5-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i16*)* @__omp_outlined__ to i8*), i8* null, i8** [[TMP4]], i32 1) +// CHECK5-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK5: .omp.deinit: +// CHECK5-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 1) +// CHECK5-NEXT: br label [[DOTEXIT:%.*]] +// CHECK5: .exit: +// CHECK5-NEXT: ret void +// +// +// CHECK5-LABEL: define {{[^@]+}}@__omp_outlined__ +// CHECK5-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i16* nonnull align 2 dereferenceable(2) [[AA:%.*]]) #[[ATTR0]] { +// CHECK5-NEXT: entry: +// CHECK5-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK5-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK5-NEXT: [[AA_ADDR:%.*]] = alloca i16*, align 4 +// CHECK5-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK5-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK5-NEXT: store i16* [[AA]], i16** [[AA_ADDR]], align 4 +// CHECK5-NEXT: [[TMP0:%.*]] = load i16*, i16** [[AA_ADDR]], align 4 +// CHECK5-NEXT: [[TMP1:%.*]] = load i16, i16* [[TMP0]], align 2 +// CHECK5-NEXT: [[CONV:%.*]] = sext i16 [[TMP1]] to i32 +// CHECK5-NEXT: [[ADD:%.*]] = add nsw i32 [[CONV]], 1 +// CHECK5-NEXT: [[CONV1:%.*]] = trunc i32 [[ADD]] to i16 +// CHECK5-NEXT: store i16 [[CONV1]], i16* [[TMP0]], align 2 +// CHECK5-NEXT: ret void +// +// +// CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l35 +// CHECK5-SAME: (i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i16* nonnull align 2 dereferenceable(2) [[AA:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { +// CHECK5-NEXT: entry: +// CHECK5-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 4 +// CHECK5-NEXT: [[AA_ADDR:%.*]] = alloca i16*, align 4 +// CHECK5-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 +// CHECK5-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [3 x i8*], align 4 +// CHECK5-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 4 +// CHECK5-NEXT: store i16* [[AA]], i16** [[AA_ADDR]], align 4 +// CHECK5-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 +// CHECK5-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 4 +// CHECK5-NEXT: [[TMP1:%.*]] = load i16*, i16** [[AA_ADDR]], align 4 +// CHECK5-NEXT: [[TMP2:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 +// CHECK5-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK5-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 1) +// CHECK5-NEXT: call void @__kmpc_data_sharing_init_stack_spmd() +// CHECK5-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK5: .execute: +// CHECK5-NEXT: [[TMP3:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) +// CHECK5-NEXT: [[TMP4:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 +// CHECK5-NEXT: [[TMP5:%.*]] = bitcast i32* [[TMP0]] to i8* +// CHECK5-NEXT: store i8* [[TMP5]], i8** [[TMP4]], align 4 +// CHECK5-NEXT: [[TMP6:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 +// CHECK5-NEXT: [[TMP7:%.*]] = bitcast i16* [[TMP1]] to i8* +// CHECK5-NEXT: store i8* [[TMP7]], i8** [[TMP6]], align 4 +// CHECK5-NEXT: [[TMP8:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 
2 +// CHECK5-NEXT: [[TMP9:%.*]] = bitcast [10 x i32]* [[TMP2]] to i8* +// CHECK5-NEXT: store i8* [[TMP9]], i8** [[TMP8]], align 4 +// CHECK5-NEXT: [[TMP10:%.*]] = bitcast [3 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK5-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP3]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32*, i16*, [10 x i32]*)* @__omp_outlined__1 to i8*), i8* null, i8** [[TMP10]], i32 3) +// CHECK5-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK5: .omp.deinit: +// CHECK5-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 1) +// CHECK5-NEXT: br label [[DOTEXIT:%.*]] +// CHECK5: .exit: +// CHECK5-NEXT: ret void +// +// +// CHECK5-LABEL: define {{[^@]+}}@__omp_outlined__1 +// CHECK5-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i16* nonnull align 2 dereferenceable(2) [[AA:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { +// CHECK5-NEXT: entry: +// CHECK5-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK5-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK5-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 4 +// CHECK5-NEXT: [[AA_ADDR:%.*]] = alloca i16*, align 4 +// CHECK5-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 +// CHECK5-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK5-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK5-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 4 +// CHECK5-NEXT: store i16* [[AA]], i16** [[AA_ADDR]], align 4 +// CHECK5-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 +// CHECK5-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 4 +// CHECK5-NEXT: [[TMP1:%.*]] = load i16*, i16** [[AA_ADDR]], align 4 +// CHECK5-NEXT: [[TMP2:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 +// CHECK5-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP0]], align 4 +// CHECK5-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP3]], 1 +// CHECK5-NEXT: store i32 [[ADD]], i32* [[TMP0]], align 4 +// CHECK5-NEXT: [[TMP4:%.*]] = load i16, i16* [[TMP1]], align 2 +// CHECK5-NEXT: [[CONV:%.*]] = sext i16 [[TMP4]] to i32 +// CHECK5-NEXT: [[ADD1:%.*]] = add nsw i32 [[CONV]], 1 +// CHECK5-NEXT: [[CONV2:%.*]] = trunc i32 [[ADD1]] to i16 +// CHECK5-NEXT: store i16 [[CONV2]], i16* [[TMP1]], align 2 +// CHECK5-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP2]], i32 0, i32 2 +// CHECK5-NEXT: [[TMP5:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +// CHECK5-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP5]], 1 +// CHECK5-NEXT: store i32 [[ADD3]], i32* [[ARRAYIDX]], align 4 +// CHECK5-NEXT: ret void +// +// +// CHECK6-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l30 +// CHECK6-SAME: (i16* nonnull align 2 dereferenceable(2) [[AA:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK6-NEXT: entry: +// CHECK6-NEXT: [[AA_ADDR:%.*]] = alloca i16*, align 4 +// CHECK6-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [1 x i8*], align 4 +// CHECK6-NEXT: store i16* [[AA]], i16** [[AA_ADDR]], align 4 +// CHECK6-NEXT: [[TMP0:%.*]] = load i16*, i16** [[AA_ADDR]], align 4 +// CHECK6-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK6-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 1) +// CHECK6-NEXT: call void @__kmpc_data_sharing_init_stack_spmd() +// CHECK6-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK6: .execute: +// CHECK6-NEXT: [[TMP1:%.*]] = call i32 
@__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]]) +// CHECK6-NEXT: [[TMP2:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 +// CHECK6-NEXT: [[TMP3:%.*]] = bitcast i16* [[TMP0]] to i8* +// CHECK6-NEXT: store i8* [[TMP3]], i8** [[TMP2]], align 4 +// CHECK6-NEXT: [[TMP4:%.*]] = bitcast [1 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK6-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i16*)* @__omp_outlined__ to i8*), i8* null, i8** [[TMP4]], i32 1) +// CHECK6-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK6: .omp.deinit: +// CHECK6-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 1) +// CHECK6-NEXT: br label [[DOTEXIT:%.*]] +// CHECK6: .exit: +// CHECK6-NEXT: ret void +// +// +// CHECK6-LABEL: define {{[^@]+}}@__omp_outlined__ +// CHECK6-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i16* nonnull align 2 dereferenceable(2) [[AA:%.*]]) #[[ATTR0]] { +// CHECK6-NEXT: entry: +// CHECK6-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK6-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK6-NEXT: [[AA_ADDR:%.*]] = alloca i16*, align 4 +// CHECK6-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK6-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK6-NEXT: store i16* [[AA]], i16** [[AA_ADDR]], align 4 +// CHECK6-NEXT: [[TMP0:%.*]] = load i16*, i16** [[AA_ADDR]], align 4 +// CHECK6-NEXT: [[TMP1:%.*]] = load i16, i16* [[TMP0]], align 2 +// CHECK6-NEXT: [[CONV:%.*]] = sext i16 [[TMP1]] to i32 +// CHECK6-NEXT: [[ADD:%.*]] = add nsw i32 [[CONV]], 1 +// CHECK6-NEXT: [[CONV1:%.*]] = trunc i32 [[ADD]] to i16 +// CHECK6-NEXT: store i16 [[CONV1]], i16* [[TMP0]], align 2 +// CHECK6-NEXT: ret void +// +// +// CHECK6-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l35 +// CHECK6-SAME: (i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i16* nonnull align 2 dereferenceable(2) [[AA:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { +// CHECK6-NEXT: entry: +// CHECK6-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 4 +// CHECK6-NEXT: [[AA_ADDR:%.*]] = alloca i16*, align 4 +// CHECK6-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 +// CHECK6-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [3 x i8*], align 4 +// CHECK6-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 4 +// CHECK6-NEXT: store i16* [[AA]], i16** [[AA_ADDR]], align 4 +// CHECK6-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 +// CHECK6-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 4 +// CHECK6-NEXT: [[TMP1:%.*]] = load i16*, i16** [[AA_ADDR]], align 4 +// CHECK6-NEXT: [[TMP2:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 +// CHECK6-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK6-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 1) +// CHECK6-NEXT: call void @__kmpc_data_sharing_init_stack_spmd() +// CHECK6-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK6: .execute: +// CHECK6-NEXT: [[TMP3:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) +// CHECK6-NEXT: [[TMP4:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 +// CHECK6-NEXT: [[TMP5:%.*]] = bitcast i32* [[TMP0]] to i8* +// CHECK6-NEXT: store i8* [[TMP5]], i8** [[TMP4]], align 4 +// CHECK6-NEXT: [[TMP6:%.*]] = getelementptr 
inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1
+// CHECK6-NEXT: [[TMP7:%.*]] = bitcast i16* [[TMP1]] to i8*
+// CHECK6-NEXT: store i8* [[TMP7]], i8** [[TMP6]], align 4
+// CHECK6-NEXT: [[TMP8:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2
+// CHECK6-NEXT: [[TMP9:%.*]] = bitcast [10 x i32]* [[TMP2]] to i8*
+// CHECK6-NEXT: store i8* [[TMP9]], i8** [[TMP8]], align 4
+// CHECK6-NEXT: [[TMP10:%.*]] = bitcast [3 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK6-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP3]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32*, i16*, [10 x i32]*)* @__omp_outlined__1 to i8*), i8* null, i8** [[TMP10]], i32 3)
+// CHECK6-NEXT: br label [[DOTOMP_DEINIT:%.*]]
+// CHECK6: .omp.deinit:
+// CHECK6-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 1)
+// CHECK6-NEXT: br label [[DOTEXIT:%.*]]
+// CHECK6: .exit:
+// CHECK6-NEXT: ret void
+//
+//
+// CHECK6-LABEL: define {{[^@]+}}@__omp_outlined__1
+// CHECK6-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i16* nonnull align 2 dereferenceable(2) [[AA:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] {
+// CHECK6-NEXT: entry:
+// CHECK6-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK6-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK6-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 4
+// CHECK6-NEXT: [[AA_ADDR:%.*]] = alloca i16*, align 4
+// CHECK6-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4
+// CHECK6-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK6-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK6-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 4
+// CHECK6-NEXT: store i16* [[AA]], i16** [[AA_ADDR]], align 4
+// CHECK6-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4
+// CHECK6-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 4
+// CHECK6-NEXT: [[TMP1:%.*]] = load i16*, i16** [[AA_ADDR]], align 4
+// CHECK6-NEXT: [[TMP2:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4
+// CHECK6-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP0]], align 4
+// CHECK6-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP3]], 1
+// CHECK6-NEXT: store i32 [[ADD]], i32* [[TMP0]], align 4
+// CHECK6-NEXT: [[TMP4:%.*]] = load i16, i16* [[TMP1]], align 2
+// CHECK6-NEXT: [[CONV:%.*]] = sext i16 [[TMP4]] to i32
+// CHECK6-NEXT: [[ADD1:%.*]] = add nsw i32 [[CONV]], 1
+// CHECK6-NEXT: [[CONV2:%.*]] = trunc i32 [[ADD1]] to i16
+// CHECK6-NEXT: store i16 [[CONV2]], i16* [[TMP1]], align 2
+// CHECK6-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP2]], i32 0, i32 2
+// CHECK6-NEXT: [[TMP5:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+// CHECK6-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP5]], 1
+// CHECK6-NEXT: store i32 [[ADD3]], i32* [[ARRAYIDX]], align 4
+// CHECK6-NEXT: ret void
+//
diff --git a/clang/test/OpenMP/nvptx_target_parallel_num_threads_codegen.cpp b/clang/test/OpenMP/nvptx_target_parallel_num_threads_codegen.cpp
--- a/clang/test/OpenMP/nvptx_target_parallel_num_threads_codegen.cpp
+++ b/clang/test/OpenMP/nvptx_target_parallel_num_threads_codegen.cpp
@@ -1,24 +1,21 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature --include-generated-funcs --replace-function-regex "__omp_offloading_[0-9a-z]+_[0-9a-z]+"
// Test target codegen - host bc file has to be created first.
// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc
-// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64
+// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix CHECK1
// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -x c++ -triple i386-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm-bc %s -o %t-x86-host.bc
-// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32
-// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32
+// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix CHECK2
+// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix CHECK3
// RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc
-// RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64
+// RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix CHECK4
// RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -x c++ -triple i386-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm-bc %s -o %t-x86-host.bc
-// RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32
-// RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32
+// RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix CHECK5
+// RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix CHECK6
// expected-no-diagnostics
#ifndef HEADER
#define HEADER
-// Check that the execution mode of all 2 target regions on the gpu is set to non-SPMD Mode.
-// CHECK-DAG: {{@__omp_offloading_.+l28}}_exec_mode = weak constant i8 0
-// CHECK-DAG: {{@__omp_offloading_.+l33}}_exec_mode = weak constant i8 0
-
template <typename tx>
tx ftemplate(int n) {
  tx a = 0;
@@ -48,67 +45,701 @@
  return a;
}
-// CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l28}}(
-// CHECK: [[AA_ADDR:%.+]] = alloca i16*, align
-// CHECK: store i16* {{%.+}}, i16** [[AA_ADDR]], align
-// CHECK: [[AA:%.+]] = load i16*, i16** [[AA_ADDR]], align
-// CHECK: [[THREAD_LIMIT:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 [[THREAD_LIMIT]], i16 1)
-// CHECK: call void @__kmpc_data_sharing_init_stack_spmd()
-// CHECK: [[GTID:%.+]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @{{.+}})
-// CHECK: store i32 [[GTID]], i32* [[THREADID:%.+]],
-// CHECK: call void [[OUTLINED:@.+]](i32* [[THREADID]], i32* %{{.+}}, i16* [[AA]])
-// CHECK: call void @__kmpc_spmd_kernel_deinit_v2(i16 1)
-// CHECK: ret void
-// CHECK: }
-
-// CHECK: define internal void [[OUTLINED]](i32* noalias %{{.+}}, i32* noalias %{{.+}}, i16* {{[^%]*}}[[ARG:%.+]])
-// CHECK: = alloca i32*, align
-// CHECK: = alloca i32*, align
-// CHECK: [[AA_ADDR:%.+]] = alloca i16*, align
-// CHECK: store i16* [[ARG]], i16** [[AA_ADDR]], align
-// CHECK: [[AA:%.+]] = load i16*, i16** [[AA_ADDR]], align
-// CHECK: [[VAL:%.+]] = load i16, i16* [[AA]], align
-// CHECK: store i16 {{%.+}}, i16* [[AA]], align
-// CHECK: ret void
-// CHECK: }
-
-// CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l33}}(
-// CHECK: [[A_ADDR:%.+]] = alloca i32*, align
-// CHECK: [[AA_ADDR:%.+]] = alloca i16*, align
-// CHECK: [[B_ADDR:%.+]] = alloca [10 x i32]*, align
-// CHECK: store i32* {{%.+}}, i32** [[A_ADDR]], align
-// CHECK: store i16* {{%.+}}, i16** [[AA_ADDR]], align
-// CHECK: store [10 x i32]* {{%.+}}, [10 x i32]** [[B_ADDR]], align
-// CHECK: [[A:%.+]] = load i32*, i32** [[A_ADDR]], align
-// CHECK: [[AA:%.+]] = load i16*, i16** [[AA_ADDR]], align
-// CHECK: [[B:%.+]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align
-// CHECK: [[THREAD_LIMIT:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 [[THREAD_LIMIT]], i16 1)
-// CHECK: call void @__kmpc_data_sharing_init_stack_spmd()
-// CHECK: [[GTID:%.+]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @{{.+}})
-// CHECK: store i32 [[GTID]], i32* [[THREADID:%.+]],
-// CHECK: call void [[OUTLINED:@.+]](i32* [[THREADID]], i32* %{{.+}}, i32* [[A]], i16* [[AA]], [10 x i32]* [[B]])
-// CHECK: call void @__kmpc_spmd_kernel_deinit_v2(i16 1)
-// CHECK: ret void
-// CHECK: 
} - -// CHECK: define internal void [[OUTLINED]](i32* noalias %{{.+}}, i32* noalias %{{.+}}, i32* {{[^%]*}}[[ARG1:%.+]], i16* {{[^%]*}}[[ARG2:%.+]], [10 x i32]* {{[^%]*}}[[ARG3:%.+]]) -// CHECK: = alloca i32*, align -// CHECK: = alloca i32*, align -// CHECK: [[A_ADDR:%.+]] = alloca i32*, align -// CHECK: [[AA_ADDR:%.+]] = alloca i16*, align -// CHECK: [[B_ADDR:%.+]] = alloca [10 x i32]*, align -// CHECK: store i32* [[ARG1]], i32** [[A_ADDR]], align -// CHECK: store i16* [[ARG2]], i16** [[AA_ADDR]], align -// CHECK: store [10 x i32]* [[ARG3]], [10 x i32]** [[B_ADDR]], align -// CHECK: [[A:%.+]] = load i32*, i32** [[A_ADDR]], align -// CHECK: [[AA:%.+]] = load i16*, i16** [[AA_ADDR]], align -// CHECK: [[B:%.+]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align -// CHECK: store i32 {{%.+}}, i32* [[A]], align -// CHECK: store i16 {{%.+}}, i16* [[AA]], align -// CHECK: [[ELT:%.+]] = getelementptr inbounds [10 x i32], [10 x i32]* [[B]], -// CHECK: store i32 {{%.+}}, i32* [[ELT]], align -// CHECK: ret void -// CHECK: } #endif +// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l25 +// CHECK1-SAME: (i16* nonnull align 2 dereferenceable(2) [[AA:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[AA_ADDR:%.*]] = alloca i16*, align 8 +// CHECK1-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [1 x i8*], align 8 +// CHECK1-NEXT: store i16* [[AA]], i16** [[AA_ADDR]], align 8 +// CHECK1-NEXT: [[TMP0:%.*]] = load i16*, i16** [[AA_ADDR]], align 8 +// CHECK1-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK1-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 1) +// CHECK1-NEXT: call void @__kmpc_data_sharing_init_stack_spmd() +// CHECK1-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK1: .execute: +// CHECK1-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]]) +// CHECK1-NEXT: [[TMP2:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0 +// CHECK1-NEXT: [[TMP3:%.*]] = bitcast i16* [[TMP0]] to i8* +// CHECK1-NEXT: store i8* [[TMP3]], i8** [[TMP2]], align 8 +// CHECK1-NEXT: [[TMP4:%.*]] = bitcast [1 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK1-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i16*)* @__omp_outlined__ to i8*), i8* null, i8** [[TMP4]], i64 1) +// CHECK1-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK1: .omp.deinit: +// CHECK1-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 1) +// CHECK1-NEXT: br label [[DOTEXIT:%.*]] +// CHECK1: .exit: +// CHECK1-NEXT: ret void +// +// +// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__ +// CHECK1-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i16* nonnull align 2 dereferenceable(2) [[AA:%.*]]) #[[ATTR0]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK1-NEXT: [[AA_ADDR:%.*]] = alloca i16*, align 8 +// CHECK1-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK1-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +// CHECK1-NEXT: store i16* [[AA]], i16** [[AA_ADDR]], align 8 +// CHECK1-NEXT: [[TMP0:%.*]] = load i16*, i16** [[AA_ADDR]], align 8 +// CHECK1-NEXT: [[TMP1:%.*]] = load i16, i16* [[TMP0]], align 2 +// CHECK1-NEXT: [[CONV:%.*]] = sext i16 [[TMP1]] to i32 +// CHECK1-NEXT: 
[[ADD:%.*]] = add nsw i32 [[CONV]], 1 +// CHECK1-NEXT: [[CONV1:%.*]] = trunc i32 [[ADD]] to i16 +// CHECK1-NEXT: store i16 [[CONV1]], i16* [[TMP0]], align 2 +// CHECK1-NEXT: ret void +// +// +// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l30 +// CHECK1-SAME: (i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i16* nonnull align 2 dereferenceable(2) [[AA:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]], i64 [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR0]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 8 +// CHECK1-NEXT: [[AA_ADDR:%.*]] = alloca i16*, align 8 +// CHECK1-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 8 +// CHECK1-NEXT: [[DOTCAPTURE_EXPR__ADDR:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [3 x i8*], align 8 +// CHECK1-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 8 +// CHECK1-NEXT: store i16* [[AA]], i16** [[AA_ADDR]], align 8 +// CHECK1-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 8 +// CHECK1-NEXT: store i64 [[DOTCAPTURE_EXPR_]], i64* [[DOTCAPTURE_EXPR__ADDR]], align 8 +// CHECK1-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 8 +// CHECK1-NEXT: [[TMP1:%.*]] = load i16*, i16** [[AA_ADDR]], align 8 +// CHECK1-NEXT: [[TMP2:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 8 +// CHECK1-NEXT: [[CONV:%.*]] = bitcast i64* [[DOTCAPTURE_EXPR__ADDR]] to i32* +// CHECK1-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK1-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 1) +// CHECK1-NEXT: call void @__kmpc_data_sharing_init_stack_spmd() +// CHECK1-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK1: .execute: +// CHECK1-NEXT: [[TMP3:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) +// CHECK1-NEXT: [[TMP4:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK1-NEXT: [[TMP5:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0 +// CHECK1-NEXT: [[TMP6:%.*]] = bitcast i32* [[TMP0]] to i8* +// CHECK1-NEXT: store i8* [[TMP6]], i8** [[TMP5]], align 8 +// CHECK1-NEXT: [[TMP7:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1 +// CHECK1-NEXT: [[TMP8:%.*]] = bitcast i16* [[TMP1]] to i8* +// CHECK1-NEXT: store i8* [[TMP8]], i8** [[TMP7]], align 8 +// CHECK1-NEXT: [[TMP9:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 2 +// CHECK1-NEXT: [[TMP10:%.*]] = bitcast [10 x i32]* [[TMP2]] to i8* +// CHECK1-NEXT: store i8* [[TMP10]], i8** [[TMP9]], align 8 +// CHECK1-NEXT: [[TMP11:%.*]] = bitcast [3 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK1-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP3]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32*, i16*, [10 x i32]*)* @__omp_outlined__1 to i8*), i8* null, i8** [[TMP11]], i64 3) +// CHECK1-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK1: .omp.deinit: +// CHECK1-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 1) +// CHECK1-NEXT: br label [[DOTEXIT:%.*]] +// CHECK1: .exit: +// CHECK1-NEXT: ret void +// +// +// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__1 +// CHECK1-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i16* nonnull align 2 dereferenceable(2) [[AA:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: 
[[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK1-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 8 +// CHECK1-NEXT: [[AA_ADDR:%.*]] = alloca i16*, align 8 +// CHECK1-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 8 +// CHECK1-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK1-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +// CHECK1-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 8 +// CHECK1-NEXT: store i16* [[AA]], i16** [[AA_ADDR]], align 8 +// CHECK1-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 8 +// CHECK1-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 8 +// CHECK1-NEXT: [[TMP1:%.*]] = load i16*, i16** [[AA_ADDR]], align 8 +// CHECK1-NEXT: [[TMP2:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 8 +// CHECK1-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP0]], align 4 +// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP3]], 1 +// CHECK1-NEXT: store i32 [[ADD]], i32* [[TMP0]], align 4 +// CHECK1-NEXT: [[TMP4:%.*]] = load i16, i16* [[TMP1]], align 2 +// CHECK1-NEXT: [[CONV:%.*]] = sext i16 [[TMP4]] to i32 +// CHECK1-NEXT: [[ADD1:%.*]] = add nsw i32 [[CONV]], 1 +// CHECK1-NEXT: [[CONV2:%.*]] = trunc i32 [[ADD1]] to i16 +// CHECK1-NEXT: store i16 [[CONV2]], i16* [[TMP1]], align 2 +// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP2]], i64 0, i64 2 +// CHECK1-NEXT: [[TMP5:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +// CHECK1-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP5]], 1 +// CHECK1-NEXT: store i32 [[ADD3]], i32* [[ARRAYIDX]], align 4 +// CHECK1-NEXT: ret void +// +// +// CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l25 +// CHECK2-SAME: (i16* nonnull align 2 dereferenceable(2) [[AA:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[AA_ADDR:%.*]] = alloca i16*, align 4 +// CHECK2-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [1 x i8*], align 4 +// CHECK2-NEXT: store i16* [[AA]], i16** [[AA_ADDR]], align 4 +// CHECK2-NEXT: [[TMP0:%.*]] = load i16*, i16** [[AA_ADDR]], align 4 +// CHECK2-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK2-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 1) +// CHECK2-NEXT: call void @__kmpc_data_sharing_init_stack_spmd() +// CHECK2-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK2: .execute: +// CHECK2-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]]) +// CHECK2-NEXT: [[TMP2:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 +// CHECK2-NEXT: [[TMP3:%.*]] = bitcast i16* [[TMP0]] to i8* +// CHECK2-NEXT: store i8* [[TMP3]], i8** [[TMP2]], align 4 +// CHECK2-NEXT: [[TMP4:%.*]] = bitcast [1 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK2-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i16*)* @__omp_outlined__ to i8*), i8* null, i8** [[TMP4]], i32 1) +// CHECK2-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK2: .omp.deinit: +// CHECK2-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 1) +// CHECK2-NEXT: br label [[DOTEXIT:%.*]] +// CHECK2: .exit: +// CHECK2-NEXT: ret void +// +// +// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__ +// CHECK2-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i16* nonnull align 2 dereferenceable(2) [[AA:%.*]]) #[[ATTR0]] { +// 
CHECK2-NEXT: entry: +// CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK2-NEXT: [[AA_ADDR:%.*]] = alloca i16*, align 4 +// CHECK2-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK2-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK2-NEXT: store i16* [[AA]], i16** [[AA_ADDR]], align 4 +// CHECK2-NEXT: [[TMP0:%.*]] = load i16*, i16** [[AA_ADDR]], align 4 +// CHECK2-NEXT: [[TMP1:%.*]] = load i16, i16* [[TMP0]], align 2 +// CHECK2-NEXT: [[CONV:%.*]] = sext i16 [[TMP1]] to i32 +// CHECK2-NEXT: [[ADD:%.*]] = add nsw i32 [[CONV]], 1 +// CHECK2-NEXT: [[CONV1:%.*]] = trunc i32 [[ADD]] to i16 +// CHECK2-NEXT: store i16 [[CONV1]], i16* [[TMP0]], align 2 +// CHECK2-NEXT: ret void +// +// +// CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l30 +// CHECK2-SAME: (i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i16* nonnull align 2 dereferenceable(2) [[AA:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]], i32 [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR0]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 4 +// CHECK2-NEXT: [[AA_ADDR:%.*]] = alloca i16*, align 4 +// CHECK2-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 +// CHECK2-NEXT: [[DOTCAPTURE_EXPR__ADDR:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [3 x i8*], align 4 +// CHECK2-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 4 +// CHECK2-NEXT: store i16* [[AA]], i16** [[AA_ADDR]], align 4 +// CHECK2-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 +// CHECK2-NEXT: store i32 [[DOTCAPTURE_EXPR_]], i32* [[DOTCAPTURE_EXPR__ADDR]], align 4 +// CHECK2-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 4 +// CHECK2-NEXT: [[TMP1:%.*]] = load i16*, i16** [[AA_ADDR]], align 4 +// CHECK2-NEXT: [[TMP2:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 +// CHECK2-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK2-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 1) +// CHECK2-NEXT: call void @__kmpc_data_sharing_init_stack_spmd() +// CHECK2-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK2: .execute: +// CHECK2-NEXT: [[TMP3:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) +// CHECK2-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR__ADDR]], align 4 +// CHECK2-NEXT: [[TMP5:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 +// CHECK2-NEXT: [[TMP6:%.*]] = bitcast i32* [[TMP0]] to i8* +// CHECK2-NEXT: store i8* [[TMP6]], i8** [[TMP5]], align 4 +// CHECK2-NEXT: [[TMP7:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 +// CHECK2-NEXT: [[TMP8:%.*]] = bitcast i16* [[TMP1]] to i8* +// CHECK2-NEXT: store i8* [[TMP8]], i8** [[TMP7]], align 4 +// CHECK2-NEXT: [[TMP9:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 +// CHECK2-NEXT: [[TMP10:%.*]] = bitcast [10 x i32]* [[TMP2]] to i8* +// CHECK2-NEXT: store i8* [[TMP10]], i8** [[TMP9]], align 4 +// CHECK2-NEXT: [[TMP11:%.*]] = bitcast [3 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK2-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP3]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32*, i16*, [10 x i32]*)* @__omp_outlined__1 to i8*), i8* null, i8** [[TMP11]], i32 3) +// CHECK2-NEXT: br 
label [[DOTOMP_DEINIT:%.*]] +// CHECK2: .omp.deinit: +// CHECK2-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 1) +// CHECK2-NEXT: br label [[DOTEXIT:%.*]] +// CHECK2: .exit: +// CHECK2-NEXT: ret void +// +// +// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__1 +// CHECK2-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i16* nonnull align 2 dereferenceable(2) [[AA:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK2-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 4 +// CHECK2-NEXT: [[AA_ADDR:%.*]] = alloca i16*, align 4 +// CHECK2-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 +// CHECK2-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK2-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK2-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 4 +// CHECK2-NEXT: store i16* [[AA]], i16** [[AA_ADDR]], align 4 +// CHECK2-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 +// CHECK2-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 4 +// CHECK2-NEXT: [[TMP1:%.*]] = load i16*, i16** [[AA_ADDR]], align 4 +// CHECK2-NEXT: [[TMP2:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 +// CHECK2-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP0]], align 4 +// CHECK2-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP3]], 1 +// CHECK2-NEXT: store i32 [[ADD]], i32* [[TMP0]], align 4 +// CHECK2-NEXT: [[TMP4:%.*]] = load i16, i16* [[TMP1]], align 2 +// CHECK2-NEXT: [[CONV:%.*]] = sext i16 [[TMP4]] to i32 +// CHECK2-NEXT: [[ADD1:%.*]] = add nsw i32 [[CONV]], 1 +// CHECK2-NEXT: [[CONV2:%.*]] = trunc i32 [[ADD1]] to i16 +// CHECK2-NEXT: store i16 [[CONV2]], i16* [[TMP1]], align 2 +// CHECK2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP2]], i32 0, i32 2 +// CHECK2-NEXT: [[TMP5:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +// CHECK2-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP5]], 1 +// CHECK2-NEXT: store i32 [[ADD3]], i32* [[ARRAYIDX]], align 4 +// CHECK2-NEXT: ret void +// +// +// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l25 +// CHECK3-SAME: (i16* nonnull align 2 dereferenceable(2) [[AA:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK3-NEXT: entry: +// CHECK3-NEXT: [[AA_ADDR:%.*]] = alloca i16*, align 4 +// CHECK3-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [1 x i8*], align 4 +// CHECK3-NEXT: store i16* [[AA]], i16** [[AA_ADDR]], align 4 +// CHECK3-NEXT: [[TMP0:%.*]] = load i16*, i16** [[AA_ADDR]], align 4 +// CHECK3-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK3-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 1) +// CHECK3-NEXT: call void @__kmpc_data_sharing_init_stack_spmd() +// CHECK3-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK3: .execute: +// CHECK3-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]]) +// CHECK3-NEXT: [[TMP2:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 +// CHECK3-NEXT: [[TMP3:%.*]] = bitcast i16* [[TMP0]] to i8* +// CHECK3-NEXT: store i8* [[TMP3]], i8** [[TMP2]], align 4 +// CHECK3-NEXT: [[TMP4:%.*]] = bitcast [1 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK3-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 
[[TMP1]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i16*)* @__omp_outlined__ to i8*), i8* null, i8** [[TMP4]], i32 1) +// CHECK3-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK3: .omp.deinit: +// CHECK3-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 1) +// CHECK3-NEXT: br label [[DOTEXIT:%.*]] +// CHECK3: .exit: +// CHECK3-NEXT: ret void +// +// +// CHECK3-LABEL: define {{[^@]+}}@__omp_outlined__ +// CHECK3-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i16* nonnull align 2 dereferenceable(2) [[AA:%.*]]) #[[ATTR0]] { +// CHECK3-NEXT: entry: +// CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK3-NEXT: [[AA_ADDR:%.*]] = alloca i16*, align 4 +// CHECK3-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK3-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK3-NEXT: store i16* [[AA]], i16** [[AA_ADDR]], align 4 +// CHECK3-NEXT: [[TMP0:%.*]] = load i16*, i16** [[AA_ADDR]], align 4 +// CHECK3-NEXT: [[TMP1:%.*]] = load i16, i16* [[TMP0]], align 2 +// CHECK3-NEXT: [[CONV:%.*]] = sext i16 [[TMP1]] to i32 +// CHECK3-NEXT: [[ADD:%.*]] = add nsw i32 [[CONV]], 1 +// CHECK3-NEXT: [[CONV1:%.*]] = trunc i32 [[ADD]] to i16 +// CHECK3-NEXT: store i16 [[CONV1]], i16* [[TMP0]], align 2 +// CHECK3-NEXT: ret void +// +// +// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l30 +// CHECK3-SAME: (i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i16* nonnull align 2 dereferenceable(2) [[AA:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]], i32 [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR0]] { +// CHECK3-NEXT: entry: +// CHECK3-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 4 +// CHECK3-NEXT: [[AA_ADDR:%.*]] = alloca i16*, align 4 +// CHECK3-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 +// CHECK3-NEXT: [[DOTCAPTURE_EXPR__ADDR:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [3 x i8*], align 4 +// CHECK3-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 4 +// CHECK3-NEXT: store i16* [[AA]], i16** [[AA_ADDR]], align 4 +// CHECK3-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 +// CHECK3-NEXT: store i32 [[DOTCAPTURE_EXPR_]], i32* [[DOTCAPTURE_EXPR__ADDR]], align 4 +// CHECK3-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 4 +// CHECK3-NEXT: [[TMP1:%.*]] = load i16*, i16** [[AA_ADDR]], align 4 +// CHECK3-NEXT: [[TMP2:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 +// CHECK3-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK3-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 1) +// CHECK3-NEXT: call void @__kmpc_data_sharing_init_stack_spmd() +// CHECK3-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK3: .execute: +// CHECK3-NEXT: [[TMP3:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) +// CHECK3-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR__ADDR]], align 4 +// CHECK3-NEXT: [[TMP5:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 +// CHECK3-NEXT: [[TMP6:%.*]] = bitcast i32* [[TMP0]] to i8* +// CHECK3-NEXT: store i8* [[TMP6]], i8** [[TMP5]], align 4 +// CHECK3-NEXT: [[TMP7:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 +// CHECK3-NEXT: [[TMP8:%.*]] = bitcast i16* [[TMP1]] to i8* +// CHECK3-NEXT: store i8* [[TMP8]], i8** [[TMP7]], align 4 +// 
CHECK3-NEXT: [[TMP9:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 +// CHECK3-NEXT: [[TMP10:%.*]] = bitcast [10 x i32]* [[TMP2]] to i8* +// CHECK3-NEXT: store i8* [[TMP10]], i8** [[TMP9]], align 4 +// CHECK3-NEXT: [[TMP11:%.*]] = bitcast [3 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK3-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP3]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32*, i16*, [10 x i32]*)* @__omp_outlined__1 to i8*), i8* null, i8** [[TMP11]], i32 3) +// CHECK3-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK3: .omp.deinit: +// CHECK3-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 1) +// CHECK3-NEXT: br label [[DOTEXIT:%.*]] +// CHECK3: .exit: +// CHECK3-NEXT: ret void +// +// +// CHECK3-LABEL: define {{[^@]+}}@__omp_outlined__1 +// CHECK3-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i16* nonnull align 2 dereferenceable(2) [[AA:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { +// CHECK3-NEXT: entry: +// CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK3-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 4 +// CHECK3-NEXT: [[AA_ADDR:%.*]] = alloca i16*, align 4 +// CHECK3-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 +// CHECK3-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK3-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK3-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 4 +// CHECK3-NEXT: store i16* [[AA]], i16** [[AA_ADDR]], align 4 +// CHECK3-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 +// CHECK3-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 4 +// CHECK3-NEXT: [[TMP1:%.*]] = load i16*, i16** [[AA_ADDR]], align 4 +// CHECK3-NEXT: [[TMP2:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 +// CHECK3-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP0]], align 4 +// CHECK3-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP3]], 1 +// CHECK3-NEXT: store i32 [[ADD]], i32* [[TMP0]], align 4 +// CHECK3-NEXT: [[TMP4:%.*]] = load i16, i16* [[TMP1]], align 2 +// CHECK3-NEXT: [[CONV:%.*]] = sext i16 [[TMP4]] to i32 +// CHECK3-NEXT: [[ADD1:%.*]] = add nsw i32 [[CONV]], 1 +// CHECK3-NEXT: [[CONV2:%.*]] = trunc i32 [[ADD1]] to i16 +// CHECK3-NEXT: store i16 [[CONV2]], i16* [[TMP1]], align 2 +// CHECK3-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP2]], i32 0, i32 2 +// CHECK3-NEXT: [[TMP5:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +// CHECK3-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP5]], 1 +// CHECK3-NEXT: store i32 [[ADD3]], i32* [[ARRAYIDX]], align 4 +// CHECK3-NEXT: ret void +// +// +// CHECK4-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l25 +// CHECK4-SAME: (i16* nonnull align 2 dereferenceable(2) [[AA:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK4-NEXT: entry: +// CHECK4-NEXT: [[AA_ADDR:%.*]] = alloca i16*, align 8 +// CHECK4-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [1 x i8*], align 8 +// CHECK4-NEXT: store i16* [[AA]], i16** [[AA_ADDR]], align 8 +// CHECK4-NEXT: [[TMP0:%.*]] = load i16*, i16** [[AA_ADDR]], align 8 +// CHECK4-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK4-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 1) +// CHECK4-NEXT: call void @__kmpc_data_sharing_init_stack_spmd() 
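The CHECK4 lines above stop at the common SPMD prologue that every regenerated kernel in these tests shares: read the launch width from llvm.nvvm.read.ptx.sreg.ntid.x(), initialize the runtime via __kmpc_spmd_kernel_init and __kmpc_data_sharing_init_stack_spmd, and then, in the lines that follow, pack the captured variables into a captured_vars_addrs array and dispatch the outlined body through the unified __kmpc_parallel_51 entry point. For orientation, a target region of roughly the following shape lowers to this pattern; the sketch is a hypothetical reconstruction from the outlined bodies, and its map and num_threads clauses are assumptions, not the verbatim test source:

// Hypothetical reconstruction; clauses are assumed, not verbatim.
template <typename tx>
tx ftemplate(int n) {
  tx a = 0;
  short aa = 0;
  tx b[10];
  // Lowers to the l25 kernel: one captured pointer stored into a
  // [1 x i8*] captured_vars_addrs array and handed to __kmpc_parallel_51.
#pragma omp target parallel map(tofrom: aa)
  {
    aa += 1; // the i16 load/sext/add nsw 1/trunc/store sequence in @__omp_outlined__
  }
  // Lowers to the l30 kernel: three captured pointers plus the captured
  // num_threads operand that arrives as the [[DOTCAPTURE_EXPR_]] argument.
#pragma omp target parallel map(tofrom: a, aa, b) num_threads(n)
  {
    a += 1;    // add nsw i32 ..., 1
    aa += 1;   // widen to i32, add 1, truncate back to i16
    b[2] += 1; // getelementptr to element 2, then add/store
  }
  return a;
}

Note that in these checks the __kmpc_parallel_51 call still passes -1 for the num_threads and proc_bind operands, and the loaded [[TMP4]] capture does not appear in the call at this stage of the patch.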
+// CHECK4-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK4: .execute: +// CHECK4-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]]) +// CHECK4-NEXT: [[TMP2:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0 +// CHECK4-NEXT: [[TMP3:%.*]] = bitcast i16* [[TMP0]] to i8* +// CHECK4-NEXT: store i8* [[TMP3]], i8** [[TMP2]], align 8 +// CHECK4-NEXT: [[TMP4:%.*]] = bitcast [1 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK4-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i16*)* @__omp_outlined__ to i8*), i8* null, i8** [[TMP4]], i64 1) +// CHECK4-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK4: .omp.deinit: +// CHECK4-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 1) +// CHECK4-NEXT: br label [[DOTEXIT:%.*]] +// CHECK4: .exit: +// CHECK4-NEXT: ret void +// +// +// CHECK4-LABEL: define {{[^@]+}}@__omp_outlined__ +// CHECK4-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i16* nonnull align 2 dereferenceable(2) [[AA:%.*]]) #[[ATTR0]] { +// CHECK4-NEXT: entry: +// CHECK4-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK4-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK4-NEXT: [[AA_ADDR:%.*]] = alloca i16*, align 8 +// CHECK4-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK4-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +// CHECK4-NEXT: store i16* [[AA]], i16** [[AA_ADDR]], align 8 +// CHECK4-NEXT: [[TMP0:%.*]] = load i16*, i16** [[AA_ADDR]], align 8 +// CHECK4-NEXT: [[TMP1:%.*]] = load i16, i16* [[TMP0]], align 2 +// CHECK4-NEXT: [[CONV:%.*]] = sext i16 [[TMP1]] to i32 +// CHECK4-NEXT: [[ADD:%.*]] = add nsw i32 [[CONV]], 1 +// CHECK4-NEXT: [[CONV1:%.*]] = trunc i32 [[ADD]] to i16 +// CHECK4-NEXT: store i16 [[CONV1]], i16* [[TMP0]], align 2 +// CHECK4-NEXT: ret void +// +// +// CHECK4-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l30 +// CHECK4-SAME: (i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i16* nonnull align 2 dereferenceable(2) [[AA:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]], i64 [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR0]] { +// CHECK4-NEXT: entry: +// CHECK4-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 8 +// CHECK4-NEXT: [[AA_ADDR:%.*]] = alloca i16*, align 8 +// CHECK4-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 8 +// CHECK4-NEXT: [[DOTCAPTURE_EXPR__ADDR:%.*]] = alloca i64, align 8 +// CHECK4-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [3 x i8*], align 8 +// CHECK4-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 8 +// CHECK4-NEXT: store i16* [[AA]], i16** [[AA_ADDR]], align 8 +// CHECK4-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 8 +// CHECK4-NEXT: store i64 [[DOTCAPTURE_EXPR_]], i64* [[DOTCAPTURE_EXPR__ADDR]], align 8 +// CHECK4-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 8 +// CHECK4-NEXT: [[TMP1:%.*]] = load i16*, i16** [[AA_ADDR]], align 8 +// CHECK4-NEXT: [[TMP2:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 8 +// CHECK4-NEXT: [[CONV:%.*]] = bitcast i64* [[DOTCAPTURE_EXPR__ADDR]] to i32* +// CHECK4-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK4-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 1) +// CHECK4-NEXT: call void @__kmpc_data_sharing_init_stack_spmd() +// CHECK4-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK4: .execute: +// 
CHECK4-NEXT: [[TMP3:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) +// CHECK4-NEXT: [[TMP4:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK4-NEXT: [[TMP5:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0 +// CHECK4-NEXT: [[TMP6:%.*]] = bitcast i32* [[TMP0]] to i8* +// CHECK4-NEXT: store i8* [[TMP6]], i8** [[TMP5]], align 8 +// CHECK4-NEXT: [[TMP7:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1 +// CHECK4-NEXT: [[TMP8:%.*]] = bitcast i16* [[TMP1]] to i8* +// CHECK4-NEXT: store i8* [[TMP8]], i8** [[TMP7]], align 8 +// CHECK4-NEXT: [[TMP9:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 2 +// CHECK4-NEXT: [[TMP10:%.*]] = bitcast [10 x i32]* [[TMP2]] to i8* +// CHECK4-NEXT: store i8* [[TMP10]], i8** [[TMP9]], align 8 +// CHECK4-NEXT: [[TMP11:%.*]] = bitcast [3 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK4-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP3]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32*, i16*, [10 x i32]*)* @__omp_outlined__1 to i8*), i8* null, i8** [[TMP11]], i64 3) +// CHECK4-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK4: .omp.deinit: +// CHECK4-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 1) +// CHECK4-NEXT: br label [[DOTEXIT:%.*]] +// CHECK4: .exit: +// CHECK4-NEXT: ret void +// +// +// CHECK4-LABEL: define {{[^@]+}}@__omp_outlined__1 +// CHECK4-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i16* nonnull align 2 dereferenceable(2) [[AA:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { +// CHECK4-NEXT: entry: +// CHECK4-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK4-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK4-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 8 +// CHECK4-NEXT: [[AA_ADDR:%.*]] = alloca i16*, align 8 +// CHECK4-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 8 +// CHECK4-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK4-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +// CHECK4-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 8 +// CHECK4-NEXT: store i16* [[AA]], i16** [[AA_ADDR]], align 8 +// CHECK4-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 8 +// CHECK4-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 8 +// CHECK4-NEXT: [[TMP1:%.*]] = load i16*, i16** [[AA_ADDR]], align 8 +// CHECK4-NEXT: [[TMP2:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 8 +// CHECK4-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP0]], align 4 +// CHECK4-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP3]], 1 +// CHECK4-NEXT: store i32 [[ADD]], i32* [[TMP0]], align 4 +// CHECK4-NEXT: [[TMP4:%.*]] = load i16, i16* [[TMP1]], align 2 +// CHECK4-NEXT: [[CONV:%.*]] = sext i16 [[TMP4]] to i32 +// CHECK4-NEXT: [[ADD1:%.*]] = add nsw i32 [[CONV]], 1 +// CHECK4-NEXT: [[CONV2:%.*]] = trunc i32 [[ADD1]] to i16 +// CHECK4-NEXT: store i16 [[CONV2]], i16* [[TMP1]], align 2 +// CHECK4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP2]], i64 0, i64 2 +// CHECK4-NEXT: [[TMP5:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +// CHECK4-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP5]], 1 +// CHECK4-NEXT: store i32 [[ADD3]], i32* [[ARRAYIDX]], align 4 +// CHECK4-NEXT: ret void +// +// +// CHECK5-LABEL: define 
{{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l25 +// CHECK5-SAME: (i16* nonnull align 2 dereferenceable(2) [[AA:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK5-NEXT: entry: +// CHECK5-NEXT: [[AA_ADDR:%.*]] = alloca i16*, align 4 +// CHECK5-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [1 x i8*], align 4 +// CHECK5-NEXT: store i16* [[AA]], i16** [[AA_ADDR]], align 4 +// CHECK5-NEXT: [[TMP0:%.*]] = load i16*, i16** [[AA_ADDR]], align 4 +// CHECK5-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK5-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 1) +// CHECK5-NEXT: call void @__kmpc_data_sharing_init_stack_spmd() +// CHECK5-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK5: .execute: +// CHECK5-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]]) +// CHECK5-NEXT: [[TMP2:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 +// CHECK5-NEXT: [[TMP3:%.*]] = bitcast i16* [[TMP0]] to i8* +// CHECK5-NEXT: store i8* [[TMP3]], i8** [[TMP2]], align 4 +// CHECK5-NEXT: [[TMP4:%.*]] = bitcast [1 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK5-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i16*)* @__omp_outlined__ to i8*), i8* null, i8** [[TMP4]], i32 1) +// CHECK5-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK5: .omp.deinit: +// CHECK5-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 1) +// CHECK5-NEXT: br label [[DOTEXIT:%.*]] +// CHECK5: .exit: +// CHECK5-NEXT: ret void +// +// +// CHECK5-LABEL: define {{[^@]+}}@__omp_outlined__ +// CHECK5-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i16* nonnull align 2 dereferenceable(2) [[AA:%.*]]) #[[ATTR0]] { +// CHECK5-NEXT: entry: +// CHECK5-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK5-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK5-NEXT: [[AA_ADDR:%.*]] = alloca i16*, align 4 +// CHECK5-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK5-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK5-NEXT: store i16* [[AA]], i16** [[AA_ADDR]], align 4 +// CHECK5-NEXT: [[TMP0:%.*]] = load i16*, i16** [[AA_ADDR]], align 4 +// CHECK5-NEXT: [[TMP1:%.*]] = load i16, i16* [[TMP0]], align 2 +// CHECK5-NEXT: [[CONV:%.*]] = sext i16 [[TMP1]] to i32 +// CHECK5-NEXT: [[ADD:%.*]] = add nsw i32 [[CONV]], 1 +// CHECK5-NEXT: [[CONV1:%.*]] = trunc i32 [[ADD]] to i16 +// CHECK5-NEXT: store i16 [[CONV1]], i16* [[TMP0]], align 2 +// CHECK5-NEXT: ret void +// +// +// CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l30 +// CHECK5-SAME: (i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i16* nonnull align 2 dereferenceable(2) [[AA:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]], i32 [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR0]] { +// CHECK5-NEXT: entry: +// CHECK5-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 4 +// CHECK5-NEXT: [[AA_ADDR:%.*]] = alloca i16*, align 4 +// CHECK5-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 +// CHECK5-NEXT: [[DOTCAPTURE_EXPR__ADDR:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [3 x i8*], align 4 +// CHECK5-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 4 +// CHECK5-NEXT: store i16* [[AA]], i16** [[AA_ADDR]], align 4 +// CHECK5-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 +// 
CHECK5-NEXT: store i32 [[DOTCAPTURE_EXPR_]], i32* [[DOTCAPTURE_EXPR__ADDR]], align 4 +// CHECK5-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 4 +// CHECK5-NEXT: [[TMP1:%.*]] = load i16*, i16** [[AA_ADDR]], align 4 +// CHECK5-NEXT: [[TMP2:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 +// CHECK5-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK5-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 1) +// CHECK5-NEXT: call void @__kmpc_data_sharing_init_stack_spmd() +// CHECK5-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK5: .execute: +// CHECK5-NEXT: [[TMP3:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) +// CHECK5-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR__ADDR]], align 4 +// CHECK5-NEXT: [[TMP5:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 +// CHECK5-NEXT: [[TMP6:%.*]] = bitcast i32* [[TMP0]] to i8* +// CHECK5-NEXT: store i8* [[TMP6]], i8** [[TMP5]], align 4 +// CHECK5-NEXT: [[TMP7:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 +// CHECK5-NEXT: [[TMP8:%.*]] = bitcast i16* [[TMP1]] to i8* +// CHECK5-NEXT: store i8* [[TMP8]], i8** [[TMP7]], align 4 +// CHECK5-NEXT: [[TMP9:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 +// CHECK5-NEXT: [[TMP10:%.*]] = bitcast [10 x i32]* [[TMP2]] to i8* +// CHECK5-NEXT: store i8* [[TMP10]], i8** [[TMP9]], align 4 +// CHECK5-NEXT: [[TMP11:%.*]] = bitcast [3 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK5-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP3]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32*, i16*, [10 x i32]*)* @__omp_outlined__1 to i8*), i8* null, i8** [[TMP11]], i32 3) +// CHECK5-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK5: .omp.deinit: +// CHECK5-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 1) +// CHECK5-NEXT: br label [[DOTEXIT:%.*]] +// CHECK5: .exit: +// CHECK5-NEXT: ret void +// +// +// CHECK5-LABEL: define {{[^@]+}}@__omp_outlined__1 +// CHECK5-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i16* nonnull align 2 dereferenceable(2) [[AA:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { +// CHECK5-NEXT: entry: +// CHECK5-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK5-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK5-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 4 +// CHECK5-NEXT: [[AA_ADDR:%.*]] = alloca i16*, align 4 +// CHECK5-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 +// CHECK5-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK5-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK5-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 4 +// CHECK5-NEXT: store i16* [[AA]], i16** [[AA_ADDR]], align 4 +// CHECK5-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 +// CHECK5-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 4 +// CHECK5-NEXT: [[TMP1:%.*]] = load i16*, i16** [[AA_ADDR]], align 4 +// CHECK5-NEXT: [[TMP2:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 +// CHECK5-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP0]], align 4 +// CHECK5-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP3]], 1 +// CHECK5-NEXT: store i32 [[ADD]], i32* [[TMP0]], align 4 +// CHECK5-NEXT: [[TMP4:%.*]] = load i16, i16* [[TMP1]], align 2 +// 
CHECK5-NEXT: [[CONV:%.*]] = sext i16 [[TMP4]] to i32
+// CHECK5-NEXT: [[ADD1:%.*]] = add nsw i32 [[CONV]], 1
+// CHECK5-NEXT: [[CONV2:%.*]] = trunc i32 [[ADD1]] to i16
+// CHECK5-NEXT: store i16 [[CONV2]], i16* [[TMP1]], align 2
+// CHECK5-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP2]], i32 0, i32 2
+// CHECK5-NEXT: [[TMP5:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+// CHECK5-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP5]], 1
+// CHECK5-NEXT: store i32 [[ADD3]], i32* [[ARRAYIDX]], align 4
+// CHECK5-NEXT: ret void
+//
+//
+// CHECK6-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l25
+// CHECK6-SAME: (i16* nonnull align 2 dereferenceable(2) [[AA:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK6-NEXT: entry:
+// CHECK6-NEXT: [[AA_ADDR:%.*]] = alloca i16*, align 4
+// CHECK6-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [1 x i8*], align 4
+// CHECK6-NEXT: store i16* [[AA]], i16** [[AA_ADDR]], align 4
+// CHECK6-NEXT: [[TMP0:%.*]] = load i16*, i16** [[AA_ADDR]], align 4
+// CHECK6-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK6-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 1)
+// CHECK6-NEXT: call void @__kmpc_data_sharing_init_stack_spmd()
+// CHECK6-NEXT: br label [[DOTEXECUTE:%.*]]
+// CHECK6: .execute:
+// CHECK6-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]])
+// CHECK6-NEXT: [[TMP2:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0
+// CHECK6-NEXT: [[TMP3:%.*]] = bitcast i16* [[TMP0]] to i8*
+// CHECK6-NEXT: store i8* [[TMP3]], i8** [[TMP2]], align 4
+// CHECK6-NEXT: [[TMP4:%.*]] = bitcast [1 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK6-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i16*)* @__omp_outlined__ to i8*), i8* null, i8** [[TMP4]], i32 1)
+// CHECK6-NEXT: br label [[DOTOMP_DEINIT:%.*]]
+// CHECK6: .omp.deinit:
+// CHECK6-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 1)
+// CHECK6-NEXT: br label [[DOTEXIT:%.*]]
+// CHECK6: .exit:
+// CHECK6-NEXT: ret void
+//
+//
+// CHECK6-LABEL: define {{[^@]+}}@__omp_outlined__
+// CHECK6-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i16* nonnull align 2 dereferenceable(2) [[AA:%.*]]) #[[ATTR0]] {
+// CHECK6-NEXT: entry:
+// CHECK6-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK6-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK6-NEXT: [[AA_ADDR:%.*]] = alloca i16*, align 4
+// CHECK6-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK6-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK6-NEXT: store i16* [[AA]], i16** [[AA_ADDR]], align 4
+// CHECK6-NEXT: [[TMP0:%.*]] = load i16*, i16** [[AA_ADDR]], align 4
+// CHECK6-NEXT: [[TMP1:%.*]] = load i16, i16* [[TMP0]], align 2
+// CHECK6-NEXT: [[CONV:%.*]] = sext i16 [[TMP1]] to i32
+// CHECK6-NEXT: [[ADD:%.*]] = add nsw i32 [[CONV]], 1
+// CHECK6-NEXT: [[CONV1:%.*]] = trunc i32 [[ADD]] to i16
+// CHECK6-NEXT: store i16 [[CONV1]], i16* [[TMP0]], align 2
+// CHECK6-NEXT: ret void
+//
+//
+// CHECK6-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l30
+// CHECK6-SAME: (i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i16* nonnull align 2 dereferenceable(2) [[AA:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]], i32 [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR0]] {
+// CHECK6-NEXT: entry:
+// CHECK6-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 4
+// CHECK6-NEXT: [[AA_ADDR:%.*]] = alloca i16*, align 4
+// CHECK6-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4
+// CHECK6-NEXT: [[DOTCAPTURE_EXPR__ADDR:%.*]] = alloca i32, align 4
+// CHECK6-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [3 x i8*], align 4
+// CHECK6-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 4
+// CHECK6-NEXT: store i16* [[AA]], i16** [[AA_ADDR]], align 4
+// CHECK6-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4
+// CHECK6-NEXT: store i32 [[DOTCAPTURE_EXPR_]], i32* [[DOTCAPTURE_EXPR__ADDR]], align 4
+// CHECK6-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 4
+// CHECK6-NEXT: [[TMP1:%.*]] = load i16*, i16** [[AA_ADDR]], align 4
+// CHECK6-NEXT: [[TMP2:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4
+// CHECK6-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK6-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 1)
+// CHECK6-NEXT: call void @__kmpc_data_sharing_init_stack_spmd()
+// CHECK6-NEXT: br label [[DOTEXECUTE:%.*]]
+// CHECK6: .execute:
+// CHECK6-NEXT: [[TMP3:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK6-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR__ADDR]], align 4
+// CHECK6-NEXT: [[TMP5:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0
+// CHECK6-NEXT: [[TMP6:%.*]] = bitcast i32* [[TMP0]] to i8*
+// CHECK6-NEXT: store i8* [[TMP6]], i8** [[TMP5]], align 4
+// CHECK6-NEXT: [[TMP7:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1
+// CHECK6-NEXT: [[TMP8:%.*]] = bitcast i16* [[TMP1]] to i8*
+// CHECK6-NEXT: store i8* [[TMP8]], i8** [[TMP7]], align 4
+// CHECK6-NEXT: [[TMP9:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2
+// CHECK6-NEXT: [[TMP10:%.*]] = bitcast [10 x i32]* [[TMP2]] to i8*
+// CHECK6-NEXT: store i8* [[TMP10]], i8** [[TMP9]], align 4
+// CHECK6-NEXT: [[TMP11:%.*]] = bitcast [3 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK6-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP3]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32*, i16*, [10 x i32]*)* @__omp_outlined__1 to i8*), i8* null, i8** [[TMP11]], i32 3)
+// CHECK6-NEXT: br label [[DOTOMP_DEINIT:%.*]]
+// CHECK6: .omp.deinit:
+// CHECK6-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 1)
+// CHECK6-NEXT: br label [[DOTEXIT:%.*]]
+// CHECK6: .exit:
+// CHECK6-NEXT: ret void
+//
+//
+// CHECK6-LABEL: define {{[^@]+}}@__omp_outlined__1
+// CHECK6-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i16* nonnull align 2 dereferenceable(2) [[AA:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] {
+// CHECK6-NEXT: entry:
+// CHECK6-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK6-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK6-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 4
+// CHECK6-NEXT: [[AA_ADDR:%.*]] = alloca i16*, align 4
+// CHECK6-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4
+// CHECK6-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK6-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK6-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 4
+// CHECK6-NEXT: store i16* [[AA]], i16** [[AA_ADDR]], align 4
+// CHECK6-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4
+// CHECK6-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 4
+// CHECK6-NEXT: [[TMP1:%.*]] = load i16*, i16** [[AA_ADDR]], align 4
+// CHECK6-NEXT: [[TMP2:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4
+// CHECK6-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP0]], align 4
+// CHECK6-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP3]], 1
+// CHECK6-NEXT: store i32 [[ADD]], i32* [[TMP0]], align 4
+// CHECK6-NEXT: [[TMP4:%.*]] = load i16, i16* [[TMP1]], align 2
+// CHECK6-NEXT: [[CONV:%.*]] = sext i16 [[TMP4]] to i32
+// CHECK6-NEXT: [[ADD1:%.*]] = add nsw i32 [[CONV]], 1
+// CHECK6-NEXT: [[CONV2:%.*]] = trunc i32 [[ADD1]] to i16
+// CHECK6-NEXT: store i16 [[CONV2]], i16* [[TMP1]], align 2
+// CHECK6-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP2]], i32 0, i32 2
+// CHECK6-NEXT: [[TMP5:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+// CHECK6-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP5]], 1
+// CHECK6-NEXT: store i32 [[ADD3]], i32* [[ARRAYIDX]], align 4
+// CHECK6-NEXT: ret void
+//
diff --git a/clang/test/OpenMP/nvptx_target_teams_codegen.cpp b/clang/test/OpenMP/nvptx_target_teams_codegen.cpp
--- a/clang/test/OpenMP/nvptx_target_teams_codegen.cpp
+++ b/clang/test/OpenMP/nvptx_target_teams_codegen.cpp
@@ -1,18 +1,14 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature --include-generated-funcs --replace-function-regex "__omp_offloading_[0-9a-z]+_[0-9a-z]+"
 // Test target codegen - host bc file has to be created first.
 // RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc
-// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix CHECK1
 // RUN: %clang_cc1 -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm-bc %s -o %t-x86-host.bc
-// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32
-// RUN: %clang_cc1 -verify -fopenmp -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK2
+// RUN: %clang_cc1 -verify -fopenmp -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK3
 // expected-no-diagnostics
 #ifndef HEADER
 #define HEADER
-// Check that the execution mode of all 2 target regions is set to Generic Mode.
-// CHECK-DAG: {{@__omp_offloading_.+l27}}_exec_mode = weak constant i8 1
-// CHECK-DAG: {{@__omp_offloading_.+l32}}_exec_mode = weak constant i8 1
-// CHECK-DAG: {{@__omp_offloading_.+l37}}_exec_mode = weak constant i8 0
-
 template <typename tx>
 tx ftemplate(int n) {
   tx a = 0;
@@ -52,208 +48,861 @@
   return a;
 }
- // CHECK-NOT: define {{.*}}void {{@__omp_offloading_.+template.+l22}}_worker()
-
-
-
-
-
-
- // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l27}}_worker()
- // CHECK-DAG: [[OMP_EXEC_STATUS:%.+]] = alloca i8,
- // CHECK-DAG: [[OMP_WORK_FN:%.+]] = alloca i8*,
- // CHECK: store i8* null, i8** [[OMP_WORK_FN]],
- // CHECK: store i8 0, i8* [[OMP_EXEC_STATUS]],
- // CHECK: br label {{%?}}[[AWAIT_WORK:.+]]
- //
- // CHECK: [[AWAIT_WORK]]
- // CHECK: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
- // CHECK: [[KPR:%.+]] = call i1 @__kmpc_kernel_parallel(i8** [[OMP_WORK_FN]])
- // CHECK: [[KPRB:%.+]] = zext i1 [[KPR]] to i8
- // store i8 [[KPRB]], i8* [[OMP_EXEC_STATUS]], align 1
- // CHECK: [[WORK:%.+]] = load i8*, i8** [[OMP_WORK_FN]],
- // CHECK: [[SHOULD_EXIT:%.+]] = icmp eq i8* [[WORK]], null
- // CHECK: br i1 [[SHOULD_EXIT]], label {{%?}}[[EXIT:.+]], label {{%?}}[[SEL_WORKERS:.+]]
- //
- // CHECK: [[SEL_WORKERS]]
- // CHECK: [[ST:%.+]] = load i8, i8* [[OMP_EXEC_STATUS]]
- // CHECK: [[IS_ACTIVE:%.+]] = icmp ne i8 [[ST]], 0
- // CHECK: br i1 [[IS_ACTIVE]], label {{%?}}[[EXEC_PARALLEL:.+]], label {{%?}}[[BAR_PARALLEL:.+]]
- //
- // CHECK: [[EXEC_PARALLEL]]
- // CHECK: br label {{%?}}[[TERM_PARALLEL:.+]]
- //
- // CHECK: [[TERM_PARALLEL]]
- // CHECK: call void @__kmpc_kernel_end_parallel()
- // CHECK: br label {{%?}}[[BAR_PARALLEL]]
- //
- // CHECK: [[BAR_PARALLEL]]
- // CHECK: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
- // CHECK: br label {{%?}}[[AWAIT_WORK]]
- //
- // CHECK: [[EXIT]]
- // CHECK: ret void
-
- // CHECK: define {{.*}}void [[T1:@__omp_offloading_.+template.+l27]](i[[SZ:32|64]] [[A:%[^)]+]])
- // CHECK: store i[[SZ]] [[A]], i[[SZ]]* [[A_ADDR:%.+]], align
- // CHECK: [[CONV:%.+]] = bitcast i[[SZ]]* [[A_ADDR]] to i8*
-
- // CHECK-DAG: [[TID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
- // CHECK-DAG: [[NTH:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
- // CHECK-DAG: [[WS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
- // CHECK-DAG: [[TH_LIMIT:%.+]] = sub nuw i32 [[NTH]], [[WS]]
- // CHECK: [[IS_WORKER:%.+]] = icmp ult i32 [[TID]], [[TH_LIMIT]]
- // CHECK: br i1 [[IS_WORKER]], label {{%?}}[[WORKER:.+]], label {{%?}}[[CHECK_MASTER:.+]]
- //
- // CHECK: [[WORKER]]
- // CHECK: {{call|invoke}} void [[T1]]_worker()
- // CHECK: br label {{%?}}[[EXIT:.+]]
- //
- // CHECK: [[CHECK_MASTER]]
- // CHECK-DAG: [[CMTID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
- // CHECK-DAG: [[CMNTH:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
- // CHECK-DAG: [[CMWS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
- // CHECK: [[IS_MASTER:%.+]] = icmp eq i32 [[CMTID]],
- // CHECK: br i1 [[IS_MASTER]], label {{%?}}[[MASTER:.+]], label {{%?}}[[EXIT]]
- //
- // CHECK: [[MASTER]]
- // CHECK-DAG: [[MNTH:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
- // CHECK-DAG: [[MWS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
- // CHECK: [[MTMP1:%.+]] = sub nuw i32 [[MNTH]], [[MWS]]
- // CHECK: call void @__kmpc_kernel_init(i32 [[MTMP1]]
- //
- // CHECK-NOT: kmpc_fork_teams
- // CHECK: [[A_VAL:%.+]] = load i8, i8* [[CONV]], align
- // CHECK: [[ACP:%.+]] = bitcast i[[SZ]]* [[AC:%.+]] to i8*
- // CHECK: store i8 [[A_VAL]], i8* [[ACP]], align
- // CHECK: [[ACV:%.+]] = load i[[SZ]], i[[SZ]]* [[AC]], align
- // CHECK: call void [[PARALLEL:@.+]](i32* %{{.+}}, i32* %{{.+}}, i[[SZ]] [[ACV]])
- // CHECK: br label {{%?}}[[TERMINATE:.+]]
- //
- // CHECK: [[TERMINATE]]
- // CHECK: call void @__kmpc_kernel_deinit(
- // CHECK: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
- // CHECK: br label {{%?}}[[EXIT]]
- //
- // CHECK: [[EXIT]]
- // CHECK: ret void
-
- // CHECK: define internal void [[PARALLEL]](i32* noalias %{{.+}}, i32* noalias %{{.+}}, i[[SZ]] [[A_VAL:%.+]])
- // CHECK: [[A_ADDR:%.+]] = alloca i[[SZ]],
- // CHECK: store i[[SZ]] [[A_VAL]], i[[SZ]]* [[A_ADDR]],
- // CHECK: [[CONV:%.+]] = bitcast i[[SZ]]* [[A_ADDR]] to i8*
- // CHECK: store i8 49, i8* [[CONV]],
- // CHECK: ret void
-
- // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l32}}_worker()
- // CHECK-DAG: [[OMP_EXEC_STATUS:%.+]] = alloca i8,
- // CHECK-DAG: [[OMP_WORK_FN:%.+]] = alloca i8*,
- // CHECK: store i8* null, i8** [[OMP_WORK_FN]],
- // CHECK: store i8 0, i8* [[OMP_EXEC_STATUS]],
- // CHECK: br label {{%?}}[[AWAIT_WORK:.+]]
- //
- // CHECK: [[AWAIT_WORK]]
- // CHECK: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
- // CHECK: [[KPR:%.+]] = call i1 @__kmpc_kernel_parallel(i8** [[OMP_WORK_FN]])
- // CHECK: [[KPRB:%.+]] = zext i1 [[KPR]] to i8
- // store i8 [[KPRB]], i8* [[OMP_EXEC_STATUS]], align 1
- // CHECK: [[WORK:%.+]] = load i8*, i8** [[OMP_WORK_FN]],
- // CHECK: [[SHOULD_EXIT:%.+]] = icmp eq i8* [[WORK]], null
- // CHECK: br i1 [[SHOULD_EXIT]], label {{%?}}[[EXIT:.+]], label {{%?}}[[SEL_WORKERS:.+]]
- //
- // CHECK: [[SEL_WORKERS]]
- // CHECK: [[ST:%.+]] = load i8, i8* [[OMP_EXEC_STATUS]]
- // CHECK: [[IS_ACTIVE:%.+]] = icmp ne i8 [[ST]], 0
- // CHECK: br i1 [[IS_ACTIVE]], label {{%?}}[[EXEC_PARALLEL:.+]], label {{%?}}[[BAR_PARALLEL:.+]]
- //
- // CHECK: [[EXEC_PARALLEL]]
- // CHECK: br label {{%?}}[[TERM_PARALLEL:.+]]
- //
- // CHECK: [[TERM_PARALLEL]]
- // CHECK: call void @__kmpc_kernel_end_parallel()
- // CHECK: br label {{%?}}[[BAR_PARALLEL]]
- //
- // CHECK: [[BAR_PARALLEL]]
- // CHECK: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
- // CHECK: br label {{%?}}[[AWAIT_WORK]]
- //
- // CHECK: [[EXIT]]
- // CHECK: ret void
-
- // CHECK: define {{.*}}void [[T2:@__omp_offloading_.+template.+l32]](i[[SZ:32|64]] [[AA:%[^)]+]])
- // CHECK: store i[[SZ]] [[AA]], i[[SZ]]* [[AA_ADDR:%.+]], align
- // CHECK: [[CONV:%.+]] = bitcast i[[SZ]]* [[AA_ADDR]] to i16*
-
- // CHECK-DAG: [[TID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
- // CHECK-DAG: [[NTH:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
- // CHECK-DAG: [[WS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
- // CHECK-DAG: [[TH_LIMIT:%.+]] = sub nuw i32 [[NTH]], [[WS]]
- // CHECK: [[IS_WORKER:%.+]] = icmp ult i32 [[TID]], [[TH_LIMIT]]
- // CHECK: br i1 [[IS_WORKER]], label {{%?}}[[WORKER:.+]], label {{%?}}[[CHECK_MASTER:.+]]
- //
- // CHECK: [[WORKER]]
- // CHECK: {{call|invoke}} void [[T2]]_worker()
- // CHECK: br label {{%?}}[[EXIT:.+]]
- //
- // CHECK: [[CHECK_MASTER]]
- // CHECK-DAG: [[CMTID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
- // CHECK-DAG: [[CMNTH:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
- // CHECK-DAG: [[CMWS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
- // CHECK: [[IS_MASTER:%.+]] = icmp eq i32 [[CMTID]],
- // CHECK: br i1 [[IS_MASTER]], label {{%?}}[[MASTER:.+]], label {{%?}}[[EXIT]]
- //
- // CHECK: [[MASTER]]
- // CHECK-DAG: [[MNTH:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
- // CHECK-DAG: [[MWS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
- // CHECK: [[MTMP1:%.+]] = sub nuw i32 [[MNTH]], [[MWS]]
- // CHECK: call void @__kmpc_kernel_init(i32 [[MTMP1]]
- //
- // CHECK-NOT: kmpc_fork_teams
- // CHECK: [[AA_VAL:%.+]] = load i16, i16* [[CONV]], align
- // CHECK: [[ACP:%.+]] = bitcast i[[SZ]]* [[AC:%.+]] to i16*
- // CHECK: store i16 [[AA_VAL]], i16* [[ACP]], align
- // CHECK: [[ACV:%.+]] = load i[[SZ]], i[[SZ]]* [[AC]], align
- // CHECK: call void [[PARALLEL:@.+]](i32* %{{.+}}, i32* %{{.+}}, i[[SZ]] [[ACV]])
- // CHECK: br label {{%?}}[[TERMINATE:.+]]
- //
- // CHECK: [[TERMINATE]]
- // CHECK: call void @__kmpc_kernel_deinit(
- // CHECK: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
- // CHECK: br label {{%?}}[[EXIT]]
- //
- // CHECK: [[EXIT]]
- // CHECK: ret void
-
- // CHECK: define internal void [[PARALLEL]](i32* noalias %{{.+}}, i32* noalias %{{.+}}, i[[SZ]] [[A_VAL:%.+]])
- // CHECK: [[A_ADDR:%.+]] = alloca i[[SZ]],
- // CHECK: store i[[SZ]] [[A_VAL]], i[[SZ]]* [[A_ADDR]],
- // CHECK: [[CONV:%.+]] = bitcast i[[SZ]]* [[A_ADDR]] to i16*
- // CHECK: store i16 1, i16* [[CONV]],
- // CHECK: ret void
-
-// CHECK: define weak void @__omp_offloading_{{.*}}ftemplate{{.*}}_l37(
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
-// CHECK: call void @__kmpc_data_sharing_init_stack_spmd
-// CHECK-NOT: call i8* @__kmpc_data_sharing_push_stack(
-// CHECK-NOT: call void @__kmpc_serialized_parallel(
-// CHECK: call void [[L0:@.+]](i32* %{{.+}}, i32* %{{.+}}, i[[SZ]] %{{.+}})
-// CHECK-NOT: call void @__kmpc_end_serialized_parallel(
-// CHECK-NOT: call void @__kmpc_data_sharing_pop_stack(
-// CHECK: call void @__kmpc_spmd_kernel_deinit_v2(i16 1)
-// CHECK: ret
-
-// CHECK: define internal void [[L0]](i32* noalias %{{.+}}, i32* noalias %{{.+}}, i[[SZ]] %{{.+}})
-// CHECK: call void [[L1:@.+]](i32* %{{.+}}, i32* %{{.+}}, i16* %{{.+}})
-// CHECK: ret void
-
-// CHECK: define internal void [[L1]](i32* noalias %{{.+}}, i32* noalias %{{.+}}, i16* nonnull align {{[0-9]+}} dereferenceable
-// CHECK: call void @__kmpc_serialized_parallel(
-// CHECK: call void [[L2:@.+]](i32* %{{.+}}, i32* %{{.+}}, i16* %{{.+}})
-// CHECK: call void @__kmpc_end_serialized_parallel(
-// CHECK: ret void
-
-// CHECK: define internal void [[L2]](i32* noalias %{{.+}}, i32* noalias %{{.+}}, i16* nonnull align {{[0-9]+}} dereferenceable
-// CHECK: store i16 1, i16* %
-// CHECK: ret void
-
 #endif
+// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l23_worker
+// CHECK1-SAME: () #[[ATTR0:[0-9]+]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 8
+// CHECK1-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1
+// CHECK1-NEXT: store i8* null, i8** [[WORK_FN]], align 8
+// CHECK1-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1
+// CHECK1-NEXT: br label [[DOTAWAIT_WORK:%.*]]
+// CHECK1: .await.work:
+// CHECK1-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
+// CHECK1-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]])
+// CHECK1-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8
+// CHECK1-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1
+// CHECK1-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 8
+// CHECK1-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null
+// CHECK1-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]]
+// CHECK1: .select.workers:
+// CHECK1-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1
+// CHECK1-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0
+// CHECK1-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]]
+// CHECK1: .execute.parallel:
+// CHECK1-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]])
+// CHECK1-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)*
+// CHECK1-NEXT: call void [[TMP5]](i16 0, i32 [[TMP4]])
+// CHECK1-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]]
+// CHECK1: .terminate.parallel:
+// CHECK1-NEXT: call void @__kmpc_kernel_end_parallel()
+// CHECK1-NEXT: br label [[DOTBARRIER_PARALLEL]]
+// CHECK1: .barrier.parallel:
+// CHECK1-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
+// CHECK1-NEXT: br label [[DOTAWAIT_WORK]]
+// CHECK1: .exit:
+// CHECK1-NEXT: ret void
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l23
+// CHECK1-SAME: (i64 [[A:%.*]]) #[[ATTR1:[0-9]+]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[A_ADDR:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[A_CASTED:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK1-NEXT: store i64 [[A]], i64* [[A_ADDR]], align 8
+// CHECK1-NEXT: [[CONV:%.*]] = bitcast i64* [[A_ADDR]] to i8*
+// CHECK1-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+// CHECK1-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK1-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+// CHECK1-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]]
+// CHECK1-NEXT: [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]]
+// CHECK1-NEXT: br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]]
+// CHECK1: .worker:
+// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l23_worker() #[[ATTR3:[0-9]+]]
+// CHECK1-NEXT: br label [[DOTEXIT:%.*]]
+// CHECK1: .mastercheck:
+// CHECK1-NEXT: [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+// CHECK1-NEXT: [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK1-NEXT: [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+// CHECK1-NEXT: [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1
+// CHECK1-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1
+// CHECK1-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], -1
+// CHECK1-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP2]], [[TMP3]]
+// CHECK1-NEXT: [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]]
+// CHECK1-NEXT: br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]]
+// CHECK1: .master:
+// CHECK1-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK1-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+// CHECK1-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]]
+// CHECK1-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1)
+// CHECK1-NEXT: call void @__kmpc_data_sharing_init_stack()
+// CHECK1-NEXT: [[TMP5:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK1-NEXT: [[TMP6:%.*]] = load i8, i8* [[CONV]], align 8
+// CHECK1-NEXT: [[CONV7:%.*]] = bitcast i64* [[A_CASTED]] to i8*
+// CHECK1-NEXT: store i8 [[TMP6]], i8* [[CONV7]], align 1
+// CHECK1-NEXT: [[TMP7:%.*]] = load i64, i64* [[A_CASTED]], align 8
+// CHECK1-NEXT: store i32 [[TMP5]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK1-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i64 [[TMP7]]) #[[ATTR3]]
+// CHECK1-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]]
+// CHECK1: .termination.notifier:
+// CHECK1-NEXT: call void @__kmpc_kernel_deinit(i16 1)
+// CHECK1-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
+// CHECK1-NEXT: br label [[DOTEXIT]]
+// CHECK1: .exit:
+// CHECK1-NEXT: ret void
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__
+// CHECK1-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[A:%.*]]) #[[ATTR1]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK1-NEXT: [[A_ADDR:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
+// CHECK1-NEXT: store i64 [[A]], i64* [[A_ADDR]], align 8
+// CHECK1-NEXT: [[CONV:%.*]] = bitcast i64* [[A_ADDR]] to i8*
+// CHECK1-NEXT: store i8 49, i8* [[CONV]], align 8
+// CHECK1-NEXT: ret void
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l28_worker
+// CHECK1-SAME: () #[[ATTR0]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 8
+// CHECK1-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1
+// CHECK1-NEXT: store i8* null, i8** [[WORK_FN]], align 8
+// CHECK1-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1
+// CHECK1-NEXT: br label [[DOTAWAIT_WORK:%.*]]
+// CHECK1: .await.work:
+// CHECK1-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
+// CHECK1-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]])
+// CHECK1-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8
+// CHECK1-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1
+// CHECK1-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 8
+// CHECK1-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null
+// CHECK1-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]]
+// CHECK1: .select.workers:
+// CHECK1-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1
+// CHECK1-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0
+// CHECK1-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]]
+// CHECK1: .execute.parallel:
+// CHECK1-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK1-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)*
+// CHECK1-NEXT: call void [[TMP5]](i16 0, i32 [[TMP4]])
+// CHECK1-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]]
+// CHECK1: .terminate.parallel:
+// CHECK1-NEXT: call void @__kmpc_kernel_end_parallel()
+// CHECK1-NEXT: br label [[DOTBARRIER_PARALLEL]]
+// CHECK1: .barrier.parallel:
+// CHECK1-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
+// CHECK1-NEXT: br label [[DOTAWAIT_WORK]]
+// CHECK1: .exit:
+// CHECK1-NEXT: ret void
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l28
+// CHECK1-SAME: (i64 [[AA:%.*]]) #[[ATTR1]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[AA_ADDR:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[AA_CASTED:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK1-NEXT: store i64 [[AA]], i64* [[AA_ADDR]], align 8
+// CHECK1-NEXT: [[CONV:%.*]] = bitcast i64* [[AA_ADDR]] to i16*
+// CHECK1-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+// CHECK1-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK1-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+// CHECK1-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]]
+// CHECK1-NEXT: [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]]
+// CHECK1-NEXT: br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]]
+// CHECK1: .worker:
+// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l28_worker() #[[ATTR3]]
+// CHECK1-NEXT: br label [[DOTEXIT:%.*]]
+// CHECK1: .mastercheck:
+// CHECK1-NEXT: [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+// CHECK1-NEXT: [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK1-NEXT: [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+// CHECK1-NEXT: [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1
+// CHECK1-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1
+// CHECK1-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], -1
+// CHECK1-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP2]], [[TMP3]]
+// CHECK1-NEXT: [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]]
+// CHECK1-NEXT: br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]]
+// CHECK1: .master:
+// CHECK1-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK1-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+// CHECK1-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]]
+// CHECK1-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1)
+// CHECK1-NEXT: call void @__kmpc_data_sharing_init_stack()
+// CHECK1-NEXT: [[TMP5:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK1-NEXT: [[TMP6:%.*]] = load i16, i16* [[CONV]], align 8
+// CHECK1-NEXT: [[CONV7:%.*]] = bitcast i64* [[AA_CASTED]] to i16*
+// CHECK1-NEXT: store i16 [[TMP6]], i16* [[CONV7]], align 2
+// CHECK1-NEXT: [[TMP7:%.*]] = load i64, i64* [[AA_CASTED]], align 8
+// CHECK1-NEXT: store i32 [[TMP5]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK1-NEXT: call void @__omp_outlined__1(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i64 [[TMP7]]) #[[ATTR3]]
+// CHECK1-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]]
+// CHECK1: .termination.notifier:
+// CHECK1-NEXT: call void @__kmpc_kernel_deinit(i16 1)
+// CHECK1-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
+// CHECK1-NEXT: br label [[DOTEXIT]]
+// CHECK1: .exit:
+// CHECK1-NEXT: ret void
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__1
+// CHECK1-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[AA:%.*]]) #[[ATTR1]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK1-NEXT: [[AA_ADDR:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
+// CHECK1-NEXT: store i64 [[AA]], i64* [[AA_ADDR]], align 8
+// CHECK1-NEXT: [[CONV:%.*]] = bitcast i64* [[AA_ADDR]] to i16*
+// CHECK1-NEXT: store i16 1, i16* [[CONV]], align 8
+// CHECK1-NEXT: ret void
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l33
+// CHECK1-SAME: (i64 [[AA:%.*]]) #[[ATTR1]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[AA_ADDR:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[AA_CASTED:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK1-NEXT: store i64 [[AA]], i64* [[AA_ADDR]], align 8
+// CHECK1-NEXT: [[CONV:%.*]] = bitcast i64* [[AA_ADDR]] to i16*
+// CHECK1-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK1-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 1)
+// CHECK1-NEXT: call void @__kmpc_data_sharing_init_stack_spmd()
+// CHECK1-NEXT: br label [[DOTEXECUTE:%.*]]
+// CHECK1: .execute:
+// CHECK1-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2:[0-9]+]])
+// CHECK1-NEXT: [[TMP1:%.*]] = load i16, i16* [[CONV]], align 8
+// CHECK1-NEXT: [[CONV1:%.*]] = bitcast i64* [[AA_CASTED]] to i16*
+// CHECK1-NEXT: store i16 [[TMP1]], i16* [[CONV1]], align 2
+// CHECK1-NEXT: [[TMP2:%.*]] = load i64, i64* [[AA_CASTED]], align 8
+// CHECK1-NEXT: store i32 [[TMP0]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK1-NEXT: call void @__omp_outlined__2(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i64 [[TMP2]]) #[[ATTR3]]
+// CHECK1-NEXT: br label [[DOTOMP_DEINIT:%.*]]
+// CHECK1: .omp.deinit:
+// CHECK1-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 1)
+// CHECK1-NEXT: br label [[DOTEXIT:%.*]]
+// CHECK1: .exit:
+// CHECK1-NEXT: ret void
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__2
+// CHECK1-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[AA:%.*]]) #[[ATTR1]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK1-NEXT: [[AA_ADDR:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [1 x i8*], align 8
+// CHECK1-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
+// CHECK1-NEXT: store i64 [[AA]], i64* [[AA_ADDR]], align 8
+// CHECK1-NEXT: [[CONV:%.*]] = bitcast i64* [[AA_ADDR]] to i16*
+// CHECK1-NEXT: [[TMP0:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0
+// CHECK1-NEXT: [[TMP1:%.*]] = bitcast i16* [[CONV]] to i8*
+// CHECK1-NEXT: store i8* [[TMP1]], i8** [[TMP0]], align 8
+// CHECK1-NEXT: [[TMP2:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP2]], align 4
+// CHECK1-NEXT: [[TMP4:%.*]] = bitcast [1 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK1-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB2]], i32 [[TMP3]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i16*)* @__omp_outlined__3 to i8*), i8* null, i8** [[TMP4]], i64 1)
+// CHECK1-NEXT: ret void
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__3
+// CHECK1-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i16* nonnull align 2 dereferenceable(2) [[AA:%.*]]) #[[ATTR1]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK1-NEXT: [[AA_ADDR:%.*]] = alloca i16*, align 8
+// CHECK1-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [1 x i8*], align 8
+// CHECK1-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
+// CHECK1-NEXT: store i16* [[AA]], i16** [[AA_ADDR]], align 8
+// CHECK1-NEXT: [[TMP0:%.*]] = load i16*, i16** [[AA_ADDR]], align 8
+// CHECK1-NEXT: [[TMP1:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0
+// CHECK1-NEXT: [[TMP2:%.*]] = bitcast i16* [[TMP0]] to i8*
+// CHECK1-NEXT: store i8* [[TMP2]], i8** [[TMP1]], align 8
+// CHECK1-NEXT: [[TMP3:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4
+// CHECK1-NEXT: [[TMP5:%.*]] = bitcast [1 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK1-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB2]], i32 [[TMP4]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i16*)* @__omp_outlined__4 to i8*), i8* null, i8** [[TMP5]], i64 1)
+// CHECK1-NEXT: ret void
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__4
+// CHECK1-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i16* nonnull align 2 dereferenceable(2) [[AA:%.*]]) #[[ATTR1]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK1-NEXT: [[AA_ADDR:%.*]] = alloca i16*, align 8
+// CHECK1-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
+// CHECK1-NEXT: store i16* [[AA]], i16** [[AA_ADDR]], align 8
+// CHECK1-NEXT: [[TMP0:%.*]] = load i16*, i16** [[AA_ADDR]], align 8
+// CHECK1-NEXT: store i16 1, i16* [[TMP0]], align 2
+// CHECK1-NEXT: ret void
+//
+//
+// CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l23_worker
+// CHECK2-SAME: () #[[ATTR0:[0-9]+]] {
+// CHECK2-NEXT: entry:
+// CHECK2-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 4
+// CHECK2-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1
+// CHECK2-NEXT: store i8* null, i8** [[WORK_FN]], align 4
+// CHECK2-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1
+// CHECK2-NEXT: br label [[DOTAWAIT_WORK:%.*]]
+// CHECK2: .await.work:
+// CHECK2-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
+// CHECK2-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]])
+// CHECK2-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8
+// CHECK2-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1
+// CHECK2-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 4
+// CHECK2-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null
+// CHECK2-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]]
+// CHECK2: .select.workers:
+// CHECK2-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1
+// CHECK2-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0
+// CHECK2-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]]
+// CHECK2: .execute.parallel:
+// CHECK2-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]])
+// CHECK2-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)*
+// CHECK2-NEXT: call void [[TMP5]](i16 0, i32 [[TMP4]])
+// CHECK2-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]]
+// CHECK2: .terminate.parallel:
+// CHECK2-NEXT: call void @__kmpc_kernel_end_parallel()
+// CHECK2-NEXT: br label [[DOTBARRIER_PARALLEL]]
+// CHECK2: .barrier.parallel:
+// CHECK2-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
+// CHECK2-NEXT: br label [[DOTAWAIT_WORK]]
+// CHECK2: .exit:
+// CHECK2-NEXT: ret void
+//
+//
+// CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l23
+// CHECK2-SAME: (i32 [[A:%.*]]) #[[ATTR1:[0-9]+]] {
+// CHECK2-NEXT: entry:
+// CHECK2-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[A_CASTED:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK2-NEXT: store i32 [[A]], i32* [[A_ADDR]], align 4
+// CHECK2-NEXT: [[CONV:%.*]] = bitcast i32* [[A_ADDR]] to i8*
+// CHECK2-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+// CHECK2-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK2-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+// CHECK2-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]]
+// CHECK2-NEXT: [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]]
+// CHECK2-NEXT: br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]]
+// CHECK2: .worker:
+// CHECK2-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l23_worker() #[[ATTR3:[0-9]+]]
+// CHECK2-NEXT: br label [[DOTEXIT:%.*]]
+// CHECK2: .mastercheck:
+// CHECK2-NEXT: [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+// CHECK2-NEXT: [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK2-NEXT: [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+// CHECK2-NEXT: [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1
+// CHECK2-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1
+// CHECK2-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], -1
+// CHECK2-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP2]], [[TMP3]]
+// CHECK2-NEXT: [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]]
+// CHECK2-NEXT: br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]]
+// CHECK2: .master:
+// CHECK2-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK2-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+// CHECK2-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]]
+// CHECK2-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1)
+// CHECK2-NEXT: call void @__kmpc_data_sharing_init_stack()
+// CHECK2-NEXT: [[TMP5:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK2-NEXT: [[TMP6:%.*]] = load i8, i8* [[CONV]], align 4
+// CHECK2-NEXT: [[CONV7:%.*]] = bitcast i32* [[A_CASTED]] to i8*
+// CHECK2-NEXT: store i8 [[TMP6]], i8* [[CONV7]], align 1
+// CHECK2-NEXT: [[TMP7:%.*]] = load i32, i32* [[A_CASTED]], align 4
+// CHECK2-NEXT: store i32 [[TMP5]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK2-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32 [[TMP7]]) #[[ATTR3]]
+// CHECK2-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]]
+// CHECK2: .termination.notifier:
+// CHECK2-NEXT: call void @__kmpc_kernel_deinit(i16 1)
+// CHECK2-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
+// CHECK2-NEXT: br label [[DOTEXIT]]
+// CHECK2: .exit:
+// CHECK2-NEXT: ret void
+//
+//
+// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__
+// CHECK2-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[A:%.*]]) #[[ATTR1]] {
+// CHECK2-NEXT: entry:
+// CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK2-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK2-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK2-NEXT: store i32 [[A]], i32* [[A_ADDR]], align 4
+// CHECK2-NEXT: [[CONV:%.*]] = bitcast i32* [[A_ADDR]] to i8*
+// CHECK2-NEXT: store i8 49, i8* [[CONV]], align 4
+// CHECK2-NEXT: ret void
+//
+//
+// CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l28_worker
+// CHECK2-SAME: () #[[ATTR0]] {
+// CHECK2-NEXT: entry:
+// CHECK2-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 4
+// CHECK2-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1
+// CHECK2-NEXT: store i8* null, i8** [[WORK_FN]], align 4
+// CHECK2-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1
+// CHECK2-NEXT: br label [[DOTAWAIT_WORK:%.*]]
+// CHECK2: .await.work:
+// CHECK2-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
+// CHECK2-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]])
+// CHECK2-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8
+// CHECK2-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1
+// CHECK2-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 4
+// CHECK2-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null
+// CHECK2-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]]
+// CHECK2: .select.workers:
+// CHECK2-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1
+// CHECK2-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0
+// CHECK2-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]]
+// CHECK2: .execute.parallel:
+// CHECK2-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK2-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)*
+// CHECK2-NEXT: call void [[TMP5]](i16 0, i32 [[TMP4]])
+// CHECK2-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]]
+// CHECK2: .terminate.parallel:
+// CHECK2-NEXT: call void @__kmpc_kernel_end_parallel()
+// CHECK2-NEXT: br label [[DOTBARRIER_PARALLEL]]
+// CHECK2: .barrier.parallel:
+// CHECK2-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
+// CHECK2-NEXT: br label [[DOTAWAIT_WORK]]
+// CHECK2: .exit:
+// CHECK2-NEXT: ret void
+//
+//
+// CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l28
+// CHECK2-SAME: (i32 [[AA:%.*]]) #[[ATTR1]] {
+// CHECK2-NEXT: entry:
+// CHECK2-NEXT: [[AA_ADDR:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[AA_CASTED:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK2-NEXT: store i32 [[AA]], i32* [[AA_ADDR]], align 4
+// CHECK2-NEXT: [[CONV:%.*]] = bitcast i32* [[AA_ADDR]] to i16*
+// CHECK2-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+// CHECK2-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK2-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+// CHECK2-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]]
+// CHECK2-NEXT: [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]]
+// CHECK2-NEXT: br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]]
+// CHECK2: .worker:
+// CHECK2-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l28_worker() #[[ATTR3]]
+// CHECK2-NEXT: br label [[DOTEXIT:%.*]]
+// CHECK2: .mastercheck:
+// CHECK2-NEXT: [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+// CHECK2-NEXT: [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK2-NEXT: [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+// CHECK2-NEXT: [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1
+// CHECK2-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1
+// CHECK2-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], -1
+// CHECK2-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP2]], [[TMP3]]
+// CHECK2-NEXT: [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]]
+// CHECK2-NEXT: br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]]
+// CHECK2: .master:
+// CHECK2-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK2-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+// CHECK2-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]]
+// CHECK2-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1)
+// CHECK2-NEXT: call void @__kmpc_data_sharing_init_stack()
+// CHECK2-NEXT: [[TMP5:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK2-NEXT: [[TMP6:%.*]] = load i16, i16* [[CONV]], align 4
+// CHECK2-NEXT: [[CONV7:%.*]] = bitcast i32* [[AA_CASTED]] to i16*
+// CHECK2-NEXT: store i16 [[TMP6]], i16* [[CONV7]], align 2
+// CHECK2-NEXT: [[TMP7:%.*]] = load i32, i32* [[AA_CASTED]], align 4
+// CHECK2-NEXT: store i32 [[TMP5]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK2-NEXT: call void @__omp_outlined__1(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32 [[TMP7]]) #[[ATTR3]]
+// CHECK2-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]]
+// CHECK2: .termination.notifier:
+// CHECK2-NEXT: call void @__kmpc_kernel_deinit(i16 1)
+// CHECK2-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
+// CHECK2-NEXT: br label [[DOTEXIT]]
+// CHECK2: .exit:
+// CHECK2-NEXT: ret void
+//
+//
+// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__1
+// CHECK2-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[AA:%.*]]) #[[ATTR1]] {
+// CHECK2-NEXT: entry:
+// CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK2-NEXT: [[AA_ADDR:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK2-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK2-NEXT: store i32 [[AA]], i32* [[AA_ADDR]], align 4
+// CHECK2-NEXT: [[CONV:%.*]] = bitcast i32* [[AA_ADDR]] to i16*
+// CHECK2-NEXT: store i16 1, i16* [[CONV]], align 4
+// CHECK2-NEXT: ret void
+//
+//
+// CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l33
+// CHECK2-SAME: (i32 [[AA:%.*]]) #[[ATTR1]] {
+// CHECK2-NEXT: entry:
+// CHECK2-NEXT: [[AA_ADDR:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[AA_CASTED:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK2-NEXT: store i32 [[AA]], i32* [[AA_ADDR]], align 4
+// CHECK2-NEXT: [[CONV:%.*]] = bitcast i32* [[AA_ADDR]] to i16*
+// CHECK2-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK2-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 1)
+// CHECK2-NEXT: call void @__kmpc_data_sharing_init_stack_spmd()
+// CHECK2-NEXT: br label [[DOTEXECUTE:%.*]]
+// CHECK2: .execute:
+// CHECK2-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2:[0-9]+]])
+// CHECK2-NEXT: [[TMP1:%.*]] = load i16, i16* [[CONV]], align 4
+// CHECK2-NEXT: [[CONV1:%.*]] = bitcast i32* [[AA_CASTED]] to i16*
+// CHECK2-NEXT: store i16 [[TMP1]], i16* [[CONV1]], align 2
+// CHECK2-NEXT: [[TMP2:%.*]] = load i32, i32* [[AA_CASTED]], align 4
+// CHECK2-NEXT: store i32 [[TMP0]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK2-NEXT: call void @__omp_outlined__2(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32 [[TMP2]]) #[[ATTR3]]
+// CHECK2-NEXT: br label [[DOTOMP_DEINIT:%.*]]
+// CHECK2: .omp.deinit:
+// CHECK2-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 1)
+// CHECK2-NEXT: br label [[DOTEXIT:%.*]]
+// CHECK2: .exit:
+// CHECK2-NEXT: ret void
+//
+//
+// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__2
+// CHECK2-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[AA:%.*]]) #[[ATTR1]] {
+// CHECK2-NEXT: entry:
+// CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK2-NEXT: [[AA_ADDR:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [1 x i8*], align 4
+// CHECK2-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK2-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK2-NEXT: store i32 [[AA]], i32* [[AA_ADDR]], align 4
+// CHECK2-NEXT: [[CONV:%.*]] = bitcast i32* [[AA_ADDR]] to i16*
+// CHECK2-NEXT: [[TMP0:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0
+// CHECK2-NEXT: [[TMP1:%.*]] = bitcast i16* [[CONV]] to i8*
+// CHECK2-NEXT: store i8* [[TMP1]], i8** [[TMP0]], align 4
+// CHECK2-NEXT: [[TMP2:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK2-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP2]], align 4
+// CHECK2-NEXT: [[TMP4:%.*]] = bitcast [1 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK2-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB2]], i32 [[TMP3]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i16*)* @__omp_outlined__3 to i8*), i8* null, i8** [[TMP4]], i32 1)
+// CHECK2-NEXT: ret void
+//
+//
+// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__3
+// CHECK2-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i16* nonnull align 2 dereferenceable(2) [[AA:%.*]]) #[[ATTR1]] {
+// CHECK2-NEXT: entry:
+// CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK2-NEXT: [[AA_ADDR:%.*]] = alloca i16*, align 4
+// CHECK2-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [1 x i8*], align 4
+// CHECK2-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK2-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK2-NEXT: store i16* [[AA]], i16** [[AA_ADDR]], align 4
+// CHECK2-NEXT: [[TMP0:%.*]] = load i16*, i16** [[AA_ADDR]], align 4
+// CHECK2-NEXT: [[TMP1:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0
+// CHECK2-NEXT: [[TMP2:%.*]] = bitcast i16* [[TMP0]] to i8*
+// CHECK2-NEXT: store i8* [[TMP2]], i8** [[TMP1]], align 4
+// CHECK2-NEXT: [[TMP3:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK2-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4
+// CHECK2-NEXT: [[TMP5:%.*]] = bitcast [1 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK2-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB2]], i32 [[TMP4]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i16*)* @__omp_outlined__4 to i8*), i8* null, i8** [[TMP5]], i32 1)
+// CHECK2-NEXT: ret void
+//
+//
+// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__4
+// CHECK2-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i16* nonnull align 2 dereferenceable(2) [[AA:%.*]]) #[[ATTR1]] {
+// CHECK2-NEXT: entry:
+// CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK2-NEXT: [[AA_ADDR:%.*]] = alloca i16*, align 4
+// CHECK2-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK2-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK2-NEXT: store i16* [[AA]], i16** [[AA_ADDR]], align 4
+// CHECK2-NEXT: [[TMP0:%.*]] = load i16*, i16** [[AA_ADDR]], align 4
+// CHECK2-NEXT: store i16 1, i16* [[TMP0]], align 2
+// CHECK2-NEXT: ret void
+//
+//
+// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l23_worker
+// CHECK3-SAME: () #[[ATTR0:[0-9]+]] {
+// CHECK3-NEXT: entry:
+// CHECK3-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 4
+// CHECK3-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1
+// CHECK3-NEXT: store i8* null, i8** [[WORK_FN]], align 4
+// CHECK3-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1
+// CHECK3-NEXT: br label [[DOTAWAIT_WORK:%.*]]
+// CHECK3: .await.work:
+// CHECK3-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
+// CHECK3-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]])
+// CHECK3-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8
+// CHECK3-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1
+// CHECK3-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 4
+// CHECK3-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null
+// CHECK3-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]]
+// CHECK3: .select.workers:
+// CHECK3-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1
+// CHECK3-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0
+// CHECK3-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]]
+// CHECK3: .execute.parallel:
+// CHECK3-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]])
+// CHECK3-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)*
+// CHECK3-NEXT: call void [[TMP5]](i16 0, i32 [[TMP4]])
+// CHECK3-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]]
+// CHECK3: .terminate.parallel:
+// CHECK3-NEXT: call void @__kmpc_kernel_end_parallel()
+// CHECK3-NEXT: br label [[DOTBARRIER_PARALLEL]]
+// CHECK3: .barrier.parallel:
+// CHECK3-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
+// CHECK3-NEXT: br label [[DOTAWAIT_WORK]]
+// CHECK3: .exit:
+// CHECK3-NEXT: ret void
+//
+//
+// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l23
+// CHECK3-SAME: (i32 [[A:%.*]]) #[[ATTR1:[0-9]+]] {
+// CHECK3-NEXT: entry:
+// CHECK3-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[A_CASTED:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK3-NEXT: store i32 [[A]], i32* [[A_ADDR]], align 4
+// CHECK3-NEXT: [[CONV:%.*]] = bitcast i32* [[A_ADDR]] to i8*
+// CHECK3-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+// CHECK3-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK3-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+// CHECK3-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]]
+// CHECK3-NEXT: [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]]
+// CHECK3-NEXT: br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]]
+// CHECK3: .worker:
+// CHECK3-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l23_worker() #[[ATTR3:[0-9]+]]
+// CHECK3-NEXT: br label [[DOTEXIT:%.*]]
+// CHECK3: .mastercheck:
+// CHECK3-NEXT: [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+// CHECK3-NEXT: [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK3-NEXT: [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+// CHECK3-NEXT: [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1
+// CHECK3-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1
+// CHECK3-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], -1
+// CHECK3-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP2]], [[TMP3]]
+// CHECK3-NEXT: [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]]
+// CHECK3-NEXT: br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]]
+// CHECK3: .master:
+// CHECK3-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK3-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+// CHECK3-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]]
+// CHECK3-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1)
+// CHECK3-NEXT: call void @__kmpc_data_sharing_init_stack()
+// CHECK3-NEXT: [[TMP5:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK3-NEXT: [[TMP6:%.*]] = load i8, i8* [[CONV]], align 4
+// CHECK3-NEXT: [[CONV7:%.*]] = bitcast i32* [[A_CASTED]] to i8*
+// CHECK3-NEXT: store i8 [[TMP6]], i8* [[CONV7]], align 1
+// CHECK3-NEXT: [[TMP7:%.*]] = load i32, i32* [[A_CASTED]], align 4
+// CHECK3-NEXT: store i32 [[TMP5]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK3-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32 [[TMP7]]) #[[ATTR3]]
+// CHECK3-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]]
+// CHECK3: .termination.notifier:
+// CHECK3-NEXT: call void @__kmpc_kernel_deinit(i16 1)
+// CHECK3-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
+// CHECK3-NEXT: br label [[DOTEXIT]]
+// CHECK3: .exit:
+// CHECK3-NEXT: ret void
+//
+//
+// CHECK3-LABEL: define {{[^@]+}}@__omp_outlined__
+// CHECK3-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[A:%.*]]) #[[ATTR1]] {
+// CHECK3-NEXT: entry:
+// CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK3-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK3-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK3-NEXT: store i32 [[A]], i32* [[A_ADDR]], align 4
+// CHECK3-NEXT: [[CONV:%.*]] = bitcast i32* [[A_ADDR]] to i8*
+// CHECK3-NEXT: store i8 49, i8* [[CONV]], align 4
+// CHECK3-NEXT: ret void
+//
+//
+// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l28_worker
+// CHECK3-SAME: () #[[ATTR0]] {
+// CHECK3-NEXT: entry:
+// CHECK3-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 4
+// CHECK3-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1
+// CHECK3-NEXT: store i8* null, i8** [[WORK_FN]], align 4
+// CHECK3-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1
+// CHECK3-NEXT: br label [[DOTAWAIT_WORK:%.*]]
+// CHECK3: .await.work:
+// CHECK3-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
+// CHECK3-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]])
+// CHECK3-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8
+// CHECK3-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1
+// CHECK3-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 4
+// CHECK3-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null
+// CHECK3-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]]
+// CHECK3: .select.workers:
+// CHECK3-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1
+// CHECK3-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0
+// CHECK3-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]]
+// CHECK3: .execute.parallel:
+// CHECK3-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK3-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)*
+// CHECK3-NEXT: call void [[TMP5]](i16 0, i32 [[TMP4]])
+// CHECK3-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]]
+// CHECK3: .terminate.parallel:
+// CHECK3-NEXT: call void @__kmpc_kernel_end_parallel()
+// CHECK3-NEXT: br label [[DOTBARRIER_PARALLEL]]
+// CHECK3: .barrier.parallel:
+// CHECK3-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
+// CHECK3-NEXT: br label [[DOTAWAIT_WORK]]
+// CHECK3: .exit:
+// CHECK3-NEXT: ret void
+//
+//
+// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l28
+// CHECK3-SAME: (i32 [[AA:%.*]]) #[[ATTR1]] {
+// CHECK3-NEXT: entry:
+// CHECK3-NEXT: [[AA_ADDR:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[AA_CASTED:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK3-NEXT: store i32 [[AA]], i32* [[AA_ADDR]], align 4
+// CHECK3-NEXT: [[CONV:%.*]] = bitcast i32* [[AA_ADDR]] to i16*
+// CHECK3-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+// CHECK3-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK3-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+// CHECK3-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]]
+// CHECK3-NEXT: [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]]
+// CHECK3-NEXT: br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]]
+// CHECK3: .worker:
+// CHECK3-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l28_worker() #[[ATTR3]]
+// CHECK3-NEXT: br label [[DOTEXIT:%.*]]
+// CHECK3: .mastercheck:
+// CHECK3-NEXT: [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+// CHECK3-NEXT: [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK3-NEXT: [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+// CHECK3-NEXT: [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1
+// CHECK3-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1
+// CHECK3-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], -1
+// CHECK3-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP2]], [[TMP3]]
+// CHECK3-NEXT: [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]]
+// CHECK3-NEXT: br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]]
+// CHECK3: .master:
+// CHECK3-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK3-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+// CHECK3-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]]
+// CHECK3-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1)
+// CHECK3-NEXT: call void @__kmpc_data_sharing_init_stack()
+// CHECK3-NEXT: [[TMP5:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK3-NEXT: [[TMP6:%.*]] = load i16, i16* [[CONV]], align 4
+// CHECK3-NEXT: [[CONV7:%.*]] = bitcast i32* [[AA_CASTED]] to i16*
+// CHECK3-NEXT: store i16 [[TMP6]], i16* [[CONV7]], align 2
+// CHECK3-NEXT: [[TMP7:%.*]] = load i32, i32* [[AA_CASTED]], align 4
+// CHECK3-NEXT: store i32 [[TMP5]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK3-NEXT: call void @__omp_outlined__1(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32 [[TMP7]]) #[[ATTR3]]
+// CHECK3-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]]
+// CHECK3: .termination.notifier:
+// CHECK3-NEXT: call void @__kmpc_kernel_deinit(i16 1)
+// CHECK3-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
+// CHECK3-NEXT: br label [[DOTEXIT]]
+// CHECK3: .exit:
+// CHECK3-NEXT: ret void
+//
+//
+// CHECK3-LABEL: define {{[^@]+}}@__omp_outlined__1
+// CHECK3-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[AA:%.*]]) #[[ATTR1]] {
+// CHECK3-NEXT: entry:
+// CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK3-NEXT: [[AA_ADDR:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK3-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK3-NEXT: store i32 [[AA]], i32* [[AA_ADDR]], align 4
+// CHECK3-NEXT: [[CONV:%.*]] = bitcast i32* [[AA_ADDR]] to i16*
+// CHECK3-NEXT: store i16 1, i16* [[CONV]], align 4
+// CHECK3-NEXT: ret void
+//
+//
+// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l33
+// CHECK3-SAME: (i32 [[AA:%.*]]) #[[ATTR1]] {
+// CHECK3-NEXT: entry:
+// CHECK3-NEXT: [[AA_ADDR:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[AA_CASTED:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK3-NEXT: store i32 [[AA]], i32* [[AA_ADDR]], align 4
+// CHECK3-NEXT: [[CONV:%.*]] = bitcast i32* [[AA_ADDR]] to i16*
+// CHECK3-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK3-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 1)
+// CHECK3-NEXT: call void @__kmpc_data_sharing_init_stack_spmd()
+// CHECK3-NEXT: br label [[DOTEXECUTE:%.*]]
+// CHECK3: .execute:
+// CHECK3-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2:[0-9]+]])
+// CHECK3-NEXT: [[TMP1:%.*]] = load i16, i16* [[CONV]], align 4
+// CHECK3-NEXT: [[CONV1:%.*]] = bitcast i32* [[AA_CASTED]] to i16*
+// CHECK3-NEXT: store i16 [[TMP1]], i16* [[CONV1]], align 2
+// CHECK3-NEXT: [[TMP2:%.*]] = load i32, i32* [[AA_CASTED]], align 4
+// CHECK3-NEXT: store i32 [[TMP0]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK3-NEXT: call void @__omp_outlined__2(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32 [[TMP2]]) #[[ATTR3]]
+// CHECK3-NEXT: br label [[DOTOMP_DEINIT:%.*]]
+// CHECK3: .omp.deinit:
+// CHECK3-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 1)
+// CHECK3-NEXT: br label [[DOTEXIT:%.*]]
+// CHECK3: .exit:
+// CHECK3-NEXT: ret void
+//
+//
+// CHECK3-LABEL: define {{[^@]+}}@__omp_outlined__2
+// CHECK3-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[AA:%.*]]) #[[ATTR1]] {
+// CHECK3-NEXT: entry:
+// CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK3-NEXT: [[AA_ADDR:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [1 x i8*], align 4
+// CHECK3-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK3-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK3-NEXT: store i32 [[AA]], i32* [[AA_ADDR]], align 4
+// CHECK3-NEXT: [[CONV:%.*]] = bitcast i32* [[AA_ADDR]] to i16*
+// CHECK3-NEXT: [[TMP0:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0
+// CHECK3-NEXT: [[TMP1:%.*]] = bitcast i16* [[CONV]] to i8*
+// CHECK3-NEXT: store i8* [[TMP1]], i8** [[TMP0]], align 4
+// CHECK3-NEXT: [[TMP2:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK3-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP2]], align 4
+// CHECK3-NEXT: [[TMP4:%.*]] = bitcast [1 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK3-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB2]], i32 [[TMP3]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i16*)* @__omp_outlined__3 to i8*), i8* null, i8** [[TMP4]], i32 1)
+// CHECK3-NEXT: ret void
+//
+//
+// CHECK3-LABEL: define {{[^@]+}}@__omp_outlined__3
+// CHECK3-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i16* nonnull align 2 dereferenceable(2) [[AA:%.*]]) #[[ATTR1]] {
+// CHECK3-NEXT: entry:
+// CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK3-NEXT: [[AA_ADDR:%.*]] = alloca i16*, align 4
+// CHECK3-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [1 x i8*], align 4
+// CHECK3-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK3-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK3-NEXT: store i16* [[AA]], i16** [[AA_ADDR]], align 4
+// CHECK3-NEXT: [[TMP0:%.*]] = load i16*, i16** [[AA_ADDR]], align 4
+// CHECK3-NEXT: [[TMP1:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0
+// CHECK3-NEXT: [[TMP2:%.*]] = bitcast i16* [[TMP0]] to i8*
+// CHECK3-NEXT: store i8* [[TMP2]], i8** [[TMP1]], align 4
+// CHECK3-NEXT: [[TMP3:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK3-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4
+// CHECK3-NEXT: [[TMP5:%.*]] = bitcast [1 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK3-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB2]], i32 [[TMP4]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i16*)* @__omp_outlined__4 to i8*), i8* null, i8** [[TMP5]], i32 1)
+// CHECK3-NEXT: ret void
+//
+//
+// CHECK3-LABEL: define {{[^@]+}}@__omp_outlined__4
+// CHECK3-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i16* nonnull align 2 dereferenceable(2) [[AA:%.*]]) #[[ATTR1]] {
+// CHECK3-NEXT: entry:
+// CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK3-NEXT: [[AA_ADDR:%.*]] = alloca i16*, align 4
+// CHECK3-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK3-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK3-NEXT: store i16* [[AA]], i16** [[AA_ADDR]], align 4
+// CHECK3-NEXT: [[TMP0:%.*]] = load i16*, i16** [[AA_ADDR]], align 4
+// CHECK3-NEXT: store i16 1, i16* [[TMP0]], align 2
+// CHECK3-NEXT: ret void
+//
diff --git a/clang/test/OpenMP/nvptx_target_teams_distribute_codegen.cpp b/clang/test/OpenMP/nvptx_target_teams_distribute_codegen.cpp
--- a/clang/test/OpenMP/nvptx_target_teams_distribute_codegen.cpp
+++ b/clang/test/OpenMP/nvptx_target_teams_distribute_codegen.cpp
@@ -1,21 +1,17 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature --include-generated-funcs --replace-function-regex "__omp_offloading_[0-9a-z]+_[0-9a-z]+"
 // Test target codegen - host bc file has to be created first.
// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc -// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64 --check-prefix SEQ -// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64 --check-prefix PAR +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix CHECK1 +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix CHECK2 // RUN: %clang_cc1 -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm-bc %s -o %t-x86-host.bc -// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 --check-prefix SEQ -// RUN: %clang_cc1 -verify -fopenmp -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 --check-prefix SEQ -// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 --check-prefix PAR -// RUN: %clang_cc1 -verify -fopenmp -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 --check-prefix PAR +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK3 +// RUN: %clang_cc1 -verify -fopenmp -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK4 +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix CHECK5 +// RUN: %clang_cc1 -verify -fopenmp -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - 
-fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix CHECK6 // expected-no-diagnostics #ifndef HEADER #define HEADER -// SEQ: [[MEM_TY:%.+]] = type { [128 x i8] } -// SEQ-DAG: {{@__omp_offloading_.+}}_l23_exec_mode = weak constant i8 1 -// SEQ-DAG: [[KERNEL_SIZE:@.+]] = internal unnamed_addr constant i{{64|32}} 4 -// SEQ-DAG: [[KERNEL_SHARED:@.+]] = internal unnamed_addr constant i16 1 - template tx ftemplate(int n) { int i; @@ -38,64 +34,1152 @@ return a; } - // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l23}}_worker() - // CHECK: ret void - - // CHECK: define {{.*}}void {{@__omp_offloading_.+template.+l23}}() - - // CHECK-DAG: [[TID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() - // CHECK-DAG: [[NTH:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() - // CHECK-DAG: [[WS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() - // CHECK-DAG: [[TH_LIMIT:%.+]] = sub nuw i32 [[NTH]], [[WS]] - // CHECK: [[IS_WORKER:%.+]] = icmp ult i32 [[TID]], [[TH_LIMIT]] - // CHECK: br i1 [[IS_WORKER]], label {{%?}}[[WORKER:.+]], label {{%?}}[[CHECK_MASTER:.+]] - // - // CHECK: [[WORKER]] - // CHECK: {{call|invoke}} void {{@__omp_offloading_.+template.+l23}}_worker() - // CHECK: br label {{%?}}[[EXIT:.+]] - // - // CHECK: [[CHECK_MASTER]] - // CHECK-DAG: [[CMTID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() - // CHECK-DAG: [[CMNTH:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() - // CHECK-DAG: [[CMWS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() - // CHECK: [[IS_MASTER:%.+]] = icmp eq i32 [[CMTID]], - // CHECK: br i1 [[IS_MASTER]], label {{%?}}[[MASTER:.+]], label {{%?}}[[EXIT]] - // - // CHECK: [[MASTER]] - // CHECK-DAG: [[MNTH:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() - // CHECK-DAG: [[MWS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() - // CHECK: [[MTMP1:%.+]] = sub nuw i32 [[MNTH]], [[MWS]] - // CHECK: call void @__kmpc_kernel_init(i32 [[MTMP1]] - // CHECK: call void [[PARALLEL:@.+]](i32* %{{.+}}, i32* %{{.+}}) - // CHECK: br label {{%?}}[[TERMINATE:.+]] - // - // CHECK: [[TERMINATE]] - // CHECK: call void @__kmpc_kernel_deinit( - // CHECK: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) - // CHECK: br label {{%?}}[[EXIT]] - // - // CHECK: [[EXIT]] - // CHECK: ret void - - // CHECK: define internal void [[PARALLEL]](i32* noalias %{{.+}}, i32* noalias %{{.+}}) - // SEQ: [[SHARED:%.+]] = load i16, i16* [[KERNEL_SHARED]], - // SEQ: [[SIZE:%.+]] = load i{{64|32}}, i{{64|32}}* [[KERNEL_SIZE]], - // SEQ: call void @__kmpc_get_team_static_memory(i16 0, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds ([[MEM_TY]], [[MEM_TY]] addrspace(3)* @{{.+}}, i32 0, i32 0, i32 0) to i8*), i{{64|32}} [[SIZE]], i16 [[SHARED]], i8** addrspacecast (i8* addrspace(3)* [[BUF:@.+]] to i8**)) - // SEQ: [[PTR:%.+]] = load i8*, i8* addrspace(3)* [[BUF]], - // SEQ: [[ADDR:%.+]] = getelementptr inbounds i8, i8* [[PTR]], i{{64|32}} 0 - // PAR: [[ADDR:%.+]] = call i8* @__kmpc_data_sharing_push_stack(i{{32|64}} 4, i16 1) - // CHECK: [[RD:%.+]] = bitcast i8* [[ADDR]] to [[GLOB_TY:%.+]]* - // CHECK: [[I_ADDR:%.+]] = getelementptr inbounds [[GLOB_TY]], [[GLOB_TY]]* [[RD]], i32 0, i32 0 - // - // CHECK: call void @__kmpc_for_static_init_4( - // CHECK: call void @__kmpc_kernel_prepare_parallel(i8* bitcast (void (i16, i32)* @{{.+}} to i8*)) - // CHECK: call void @__kmpc_begin_sharing_variables(i8*** [[SHARED_VARS_PTR:%.+]], i{{64|32}} 1) - // CHECK: [[SHARED_VARS_BUF:%.+]] = load i8**, i8*** [[SHARED_VARS_PTR]], - // CHECK: [[VARS_BUF:%.+]] = 
getelementptr inbounds i8*, i8** [[SHARED_VARS_BUF]], i{{64|32}} 0 - // CHECK: [[I_ADDR_BC:%.+]] = bitcast i32* [[I_ADDR]] to i8* - // CHECK: store i8* [[I_ADDR_BC]], i8** [[VARS_BUF]], - // CHECK: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) - // CHECK: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) - // CHECK: call void @__kmpc_end_sharing_variables() - // CHECK: call void @__kmpc_for_static_fini( #endif +// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l19_worker +// CHECK1-SAME: () #[[ATTR0:[0-9]+]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 8 +// CHECK1-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1 +// CHECK1-NEXT: store i8* null, i8** [[WORK_FN]], align 8 +// CHECK1-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1 +// CHECK1-NEXT: br label [[DOTAWAIT_WORK:%.*]] +// CHECK1: .await.work: +// CHECK1-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK1-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]]) +// CHECK1-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8 +// CHECK1-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1 +// CHECK1-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 8 +// CHECK1-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null +// CHECK1-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]] +// CHECK1: .select.workers: +// CHECK1-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1 +// CHECK1-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0 +// CHECK1-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]] +// CHECK1: .execute.parallel: +// CHECK1-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2:[0-9]+]]) +// CHECK1-NEXT: [[TMP5:%.*]] = load i8*, i8** [[WORK_FN]], align 8 +// CHECK1-NEXT: [[WORK_MATCH:%.*]] = icmp eq i8* [[TMP5]], bitcast (void (i16, i32)* @__omp_outlined__1_wrapper to i8*) +// CHECK1-NEXT: br i1 [[WORK_MATCH]], label [[DOTEXECUTE_FN:%.*]], label [[DOTCHECK_NEXT:%.*]] +// CHECK1: .execute.fn: +// CHECK1-NEXT: call void @__omp_outlined__1_wrapper(i16 0, i32 [[TMP4]]) #[[ATTR3:[0-9]+]] +// CHECK1-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]] +// CHECK1: .check.next: +// CHECK1-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)* +// CHECK1-NEXT: call void [[TMP6]](i16 0, i32 [[TMP4]]) +// CHECK1-NEXT: br label [[DOTTERMINATE_PARALLEL]] +// CHECK1: .terminate.parallel: +// CHECK1-NEXT: call void @__kmpc_kernel_end_parallel() +// CHECK1-NEXT: br label [[DOTBARRIER_PARALLEL]] +// CHECK1: .barrier.parallel: +// CHECK1-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK1-NEXT: br label [[DOTAWAIT_WORK]] +// CHECK1: .exit: +// CHECK1-NEXT: ret void +// +// +// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l19 +// CHECK1-SAME: () #[[ATTR1:[0-9]+]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK1-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK1-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK1-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK1-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], 
[[NVPTX_WARP_SIZE]] +// CHECK1-NEXT: [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]] +// CHECK1-NEXT: br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]] +// CHECK1: .worker: +// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l19_worker() #[[ATTR3]] +// CHECK1-NEXT: br label [[DOTEXIT:%.*]] +// CHECK1: .mastercheck: +// CHECK1-NEXT: [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK1-NEXT: [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK1-NEXT: [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK1-NEXT: [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1 +// CHECK1-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1 +// CHECK1-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], -1 +// CHECK1-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP2]], [[TMP3]] +// CHECK1-NEXT: [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]] +// CHECK1-NEXT: br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]] +// CHECK1: .master: +// CHECK1-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK1-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK1-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]] +// CHECK1-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1) +// CHECK1-NEXT: call void @__kmpc_data_sharing_init_stack() +// CHECK1-NEXT: [[TMP5:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]]) +// CHECK1-NEXT: store i32 [[TMP5]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK1-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]] +// CHECK1-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]] +// CHECK1: .termination.notifier: +// CHECK1-NEXT: call void @__kmpc_kernel_deinit(i16 1) +// CHECK1-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK1-NEXT: br label [[DOTEXIT]] +// CHECK1: .exit: +// CHECK1-NEXT: ret void +// +// +// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__ +// CHECK1-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK1-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [1 x i8*], align 8 +// CHECK1-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK1-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +// CHECK1-NEXT: [[TMP0:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared", align 2 +// CHECK1-NEXT: [[TMP1:%.*]] = load i64, i64* @"_openmp_static_kernel$size", align 8 +// CHECK1-NEXT: call void @__kmpc_get_team_static_memory(i16 0, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds (%"union._shared_openmp_static_memory_type_$_", %"union._shared_openmp_static_memory_type_$_" addrspace(3)* @"_openmp_shared_static_glob_rd_$_", i32 0, i32 0, i32 0) to i8*), i64 [[TMP1]], i16 [[TMP0]], i8** addrspacecast (i8* addrspace(3)* 
@"_openmp_kernel_static_glob_rd$ptr" to i8**)) +// CHECK1-NEXT: [[TMP2:%.*]] = load i8*, i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr", align 8 +// CHECK1-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, i8* [[TMP2]], i64 0 +// CHECK1-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty* +// CHECK1-NEXT: [[I:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP4]], i32 0, i32 0 +// CHECK1-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 +// CHECK1-NEXT: store i32 9, i32* [[DOTOMP_UB]], align 4 +// CHECK1-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK1-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK1-NEXT: [[TMP5:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK1-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4 +// CHECK1-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1:[0-9]+]], i32 [[TMP6]], i32 92, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) +// CHECK1-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK1-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP7]], 9 +// CHECK1-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK1: cond.true: +// CHECK1-NEXT: br label [[COND_END:%.*]] +// CHECK1: cond.false: +// CHECK1-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK1-NEXT: br label [[COND_END]] +// CHECK1: cond.end: +// CHECK1-NEXT: [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP8]], [[COND_FALSE]] ] +// CHECK1-NEXT: store i32 [[COND]], i32* [[DOTOMP_UB]], align 4 +// CHECK1-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK1-NEXT: store i32 [[TMP9]], i32* [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK1: omp.inner.for.cond: +// CHECK1-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK1-NEXT: [[CMP1:%.*]] = icmp sle i32 [[TMP10]], [[TMP11]] +// CHECK1-NEXT: br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK1: omp.inner.for.body: +// CHECK1-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP12]], 1 +// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK1-NEXT: store i32 [[ADD]], i32* [[I]], align 4 +// CHECK1-NEXT: [[TMP13:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0 +// CHECK1-NEXT: [[TMP14:%.*]] = bitcast i32* [[I]] to i8* +// CHECK1-NEXT: store i8* [[TMP14]], i8** [[TMP13]], align 8 +// CHECK1-NEXT: [[TMP15:%.*]] = bitcast [1 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK1-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB2]], i32 [[TMP6]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32*)* @__omp_outlined__1 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__1_wrapper to i8*), i8** [[TMP15]], i64 1) +// CHECK1-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK1: omp.body.continue: +// CHECK1-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK1: omp.inner.for.inc: +// CHECK1-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP16]], 1 +// CHECK1-NEXT: store i32 [[ADD2]], i32* [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK1: omp.inner.for.end: +// CHECK1-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK1: omp.loop.exit: +// 
CHECK1-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP6]]) +// CHECK1-NEXT: [[TMP17:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared", align 2 +// CHECK1-NEXT: call void @__kmpc_restore_team_static_memory(i16 0, i16 [[TMP17]]) +// CHECK1-NEXT: ret void +// +// +// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__1 +// CHECK1-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[I:%.*]]) #[[ATTR1]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK1-NEXT: [[I_ADDR:%.*]] = alloca i32*, align 8 +// CHECK1-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK1-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +// CHECK1-NEXT: store i32* [[I]], i32** [[I_ADDR]], align 8 +// CHECK1-NEXT: [[TMP0:%.*]] = load i32*, i32** [[I_ADDR]], align 8 +// CHECK1-NEXT: [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4 +// CHECK1-NEXT: [[INC:%.*]] = add nsw i32 [[TMP1]], 1 +// CHECK1-NEXT: store i32 [[INC]], i32* [[TMP0]], align 4 +// CHECK1-NEXT: ret void +// +// +// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__1_wrapper +// CHECK1-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 +// CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 +// CHECK1-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK1-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 +// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK1-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) +// CHECK1-NEXT: [[TMP2:%.*]] = load i8**, i8*** [[GLOBAL_ARGS]], align 8 +// CHECK1-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8*, i8** [[TMP2]], i64 0 +// CHECK1-NEXT: [[TMP4:%.*]] = bitcast i8** [[TMP3]] to i32** +// CHECK1-NEXT: [[TMP5:%.*]] = load i32*, i32** [[TMP4]], align 8 +// CHECK1-NEXT: call void @__omp_outlined__1(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]], i32* [[TMP5]]) #[[ATTR3]] +// CHECK1-NEXT: ret void +// +// +// CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l19_worker +// CHECK2-SAME: () #[[ATTR0:[0-9]+]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 8 +// CHECK2-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1 +// CHECK2-NEXT: store i8* null, i8** [[WORK_FN]], align 8 +// CHECK2-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1 +// CHECK2-NEXT: br label [[DOTAWAIT_WORK:%.*]] +// CHECK2: .await.work: +// CHECK2-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK2-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]]) +// CHECK2-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8 +// CHECK2-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1 +// CHECK2-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 8 +// CHECK2-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null +// CHECK2-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]] +// CHECK2: .select.workers: +// CHECK2-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1 +// CHECK2-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0 +// CHECK2-NEXT: br i1 [[IS_ACTIVE]], label 
[[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]] +// CHECK2: .execute.parallel: +// CHECK2-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2:[0-9]+]]) +// CHECK2-NEXT: [[TMP5:%.*]] = load i8*, i8** [[WORK_FN]], align 8 +// CHECK2-NEXT: [[WORK_MATCH:%.*]] = icmp eq i8* [[TMP5]], bitcast (void (i16, i32)* @__omp_outlined__1_wrapper to i8*) +// CHECK2-NEXT: br i1 [[WORK_MATCH]], label [[DOTEXECUTE_FN:%.*]], label [[DOTCHECK_NEXT:%.*]] +// CHECK2: .execute.fn: +// CHECK2-NEXT: call void @__omp_outlined__1_wrapper(i16 0, i32 [[TMP4]]) #[[ATTR3:[0-9]+]] +// CHECK2-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]] +// CHECK2: .check.next: +// CHECK2-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)* +// CHECK2-NEXT: call void [[TMP6]](i16 0, i32 [[TMP4]]) +// CHECK2-NEXT: br label [[DOTTERMINATE_PARALLEL]] +// CHECK2: .terminate.parallel: +// CHECK2-NEXT: call void @__kmpc_kernel_end_parallel() +// CHECK2-NEXT: br label [[DOTBARRIER_PARALLEL]] +// CHECK2: .barrier.parallel: +// CHECK2-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK2-NEXT: br label [[DOTAWAIT_WORK]] +// CHECK2: .exit: +// CHECK2-NEXT: ret void +// +// +// CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l19 +// CHECK2-SAME: () #[[ATTR1:[0-9]+]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK2-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK2-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK2-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK2-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]] +// CHECK2-NEXT: [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]] +// CHECK2-NEXT: br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]] +// CHECK2: .worker: +// CHECK2-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l19_worker() #[[ATTR3]] +// CHECK2-NEXT: br label [[DOTEXIT:%.*]] +// CHECK2: .mastercheck: +// CHECK2-NEXT: [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK2-NEXT: [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK2-NEXT: [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK2-NEXT: [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1 +// CHECK2-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1 +// CHECK2-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], -1 +// CHECK2-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP2]], [[TMP3]] +// CHECK2-NEXT: [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]] +// CHECK2-NEXT: br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]] +// CHECK2: .master: +// CHECK2-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK2-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK2-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]] +// CHECK2-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1) +// CHECK2-NEXT: call void @__kmpc_data_sharing_init_stack() +// CHECK2-NEXT: [[TMP5:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]]) +// CHECK2-NEXT: store i32 [[TMP5]], i32* [[DOTTHREADID_TEMP_]], align 4 +// 
CHECK2-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]] +// CHECK2-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]] +// CHECK2: .termination.notifier: +// CHECK2-NEXT: call void @__kmpc_kernel_deinit(i16 1) +// CHECK2-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK2-NEXT: br label [[DOTEXIT]] +// CHECK2: .exit: +// CHECK2-NEXT: ret void +// +// +// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__ +// CHECK2-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK2-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [1 x i8*], align 8 +// CHECK2-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK2-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +// CHECK2-NEXT: [[TMP0:%.*]] = call i8* @__kmpc_data_sharing_push_stack(i64 4, i16 1) +// CHECK2-NEXT: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to %struct._globalized_locals_ty* +// CHECK2-NEXT: [[I:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP1]], i32 0, i32 0 +// CHECK2-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 +// CHECK2-NEXT: store i32 9, i32* [[DOTOMP_UB]], align 4 +// CHECK2-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK2-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK2-NEXT: [[TMP2:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK2-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP2]], align 4 +// CHECK2-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1:[0-9]+]], i32 [[TMP3]], i32 92, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) +// CHECK2-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK2-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP4]], 9 +// CHECK2-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK2: cond.true: +// CHECK2-NEXT: br label [[COND_END:%.*]] +// CHECK2: cond.false: +// CHECK2-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK2-NEXT: br label [[COND_END]] +// CHECK2: cond.end: +// CHECK2-NEXT: [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP5]], [[COND_FALSE]] ] +// CHECK2-NEXT: store i32 [[COND]], i32* [[DOTOMP_UB]], align 4 +// CHECK2-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK2-NEXT: store i32 [[TMP6]], i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK2: omp.inner.for.cond: +// CHECK2-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK2-NEXT: [[CMP1:%.*]] = icmp sle i32 [[TMP7]], [[TMP8]] +// CHECK2-NEXT: br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK2: omp.inner.for.body: +// CHECK2-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP9]], 1 +// CHECK2-NEXT: [[ADD:%.*]] = add 
nsw i32 0, [[MUL]] +// CHECK2-NEXT: store i32 [[ADD]], i32* [[I]], align 4 +// CHECK2-NEXT: [[TMP10:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0 +// CHECK2-NEXT: [[TMP11:%.*]] = bitcast i32* [[I]] to i8* +// CHECK2-NEXT: store i8* [[TMP11]], i8** [[TMP10]], align 8 +// CHECK2-NEXT: [[TMP12:%.*]] = bitcast [1 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK2-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB2]], i32 [[TMP3]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32*)* @__omp_outlined__1 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__1_wrapper to i8*), i8** [[TMP12]], i64 1) +// CHECK2-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK2: omp.body.continue: +// CHECK2-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK2: omp.inner.for.inc: +// CHECK2-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP13]], 1 +// CHECK2-NEXT: store i32 [[ADD2]], i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK2: omp.inner.for.end: +// CHECK2-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK2: omp.loop.exit: +// CHECK2-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP3]]) +// CHECK2-NEXT: call void @__kmpc_data_sharing_pop_stack(i8* [[TMP0]]) +// CHECK2-NEXT: ret void +// +// +// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__1 +// CHECK2-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[I:%.*]]) #[[ATTR1]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK2-NEXT: [[I_ADDR:%.*]] = alloca i32*, align 8 +// CHECK2-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK2-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +// CHECK2-NEXT: store i32* [[I]], i32** [[I_ADDR]], align 8 +// CHECK2-NEXT: [[TMP0:%.*]] = load i32*, i32** [[I_ADDR]], align 8 +// CHECK2-NEXT: [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4 +// CHECK2-NEXT: [[INC:%.*]] = add nsw i32 [[TMP1]], 1 +// CHECK2-NEXT: store i32 [[INC]], i32* [[TMP0]], align 4 +// CHECK2-NEXT: ret void +// +// +// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__1_wrapper +// CHECK2-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 +// CHECK2-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 +// CHECK2-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK2-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 +// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK2-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) +// CHECK2-NEXT: [[TMP2:%.*]] = load i8**, i8*** [[GLOBAL_ARGS]], align 8 +// CHECK2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8*, i8** [[TMP2]], i64 0 +// CHECK2-NEXT: [[TMP4:%.*]] = bitcast i8** [[TMP3]] to i32** +// CHECK2-NEXT: [[TMP5:%.*]] = load i32*, i32** [[TMP4]], align 8 +// CHECK2-NEXT: call void @__omp_outlined__1(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]], i32* [[TMP5]]) #[[ATTR3]] +// CHECK2-NEXT: ret void +// +// +// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l19_worker +// CHECK3-SAME: () #[[ATTR0:[0-9]+]] { 
+// CHECK3-NEXT: entry: +// CHECK3-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 4 +// CHECK3-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1 +// CHECK3-NEXT: store i8* null, i8** [[WORK_FN]], align 4 +// CHECK3-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1 +// CHECK3-NEXT: br label [[DOTAWAIT_WORK:%.*]] +// CHECK3: .await.work: +// CHECK3-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK3-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]]) +// CHECK3-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8 +// CHECK3-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1 +// CHECK3-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 4 +// CHECK3-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null +// CHECK3-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]] +// CHECK3: .select.workers: +// CHECK3-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1 +// CHECK3-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0 +// CHECK3-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]] +// CHECK3: .execute.parallel: +// CHECK3-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2:[0-9]+]]) +// CHECK3-NEXT: [[TMP5:%.*]] = load i8*, i8** [[WORK_FN]], align 4 +// CHECK3-NEXT: [[WORK_MATCH:%.*]] = icmp eq i8* [[TMP5]], bitcast (void (i16, i32)* @__omp_outlined__1_wrapper to i8*) +// CHECK3-NEXT: br i1 [[WORK_MATCH]], label [[DOTEXECUTE_FN:%.*]], label [[DOTCHECK_NEXT:%.*]] +// CHECK3: .execute.fn: +// CHECK3-NEXT: call void @__omp_outlined__1_wrapper(i16 0, i32 [[TMP4]]) #[[ATTR3:[0-9]+]] +// CHECK3-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]] +// CHECK3: .check.next: +// CHECK3-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)* +// CHECK3-NEXT: call void [[TMP6]](i16 0, i32 [[TMP4]]) +// CHECK3-NEXT: br label [[DOTTERMINATE_PARALLEL]] +// CHECK3: .terminate.parallel: +// CHECK3-NEXT: call void @__kmpc_kernel_end_parallel() +// CHECK3-NEXT: br label [[DOTBARRIER_PARALLEL]] +// CHECK3: .barrier.parallel: +// CHECK3-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK3-NEXT: br label [[DOTAWAIT_WORK]] +// CHECK3: .exit: +// CHECK3-NEXT: ret void +// +// +// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l19 +// CHECK3-SAME: () #[[ATTR1:[0-9]+]] { +// CHECK3-NEXT: entry: +// CHECK3-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK3-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK3-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK3-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK3-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]] +// CHECK3-NEXT: [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]] +// CHECK3-NEXT: br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]] +// CHECK3: .worker: +// CHECK3-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l19_worker() #[[ATTR3]] +// CHECK3-NEXT: br label [[DOTEXIT:%.*]] +// CHECK3: .mastercheck: +// CHECK3-NEXT: [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK3-NEXT: [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK3-NEXT: [[NVPTX_WARP_SIZE3:%.*]] = call 
i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK3-NEXT: [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1 +// CHECK3-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1 +// CHECK3-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], -1 +// CHECK3-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP2]], [[TMP3]] +// CHECK3-NEXT: [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]] +// CHECK3-NEXT: br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]] +// CHECK3: .master: +// CHECK3-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK3-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK3-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]] +// CHECK3-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1) +// CHECK3-NEXT: call void @__kmpc_data_sharing_init_stack() +// CHECK3-NEXT: [[TMP5:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]]) +// CHECK3-NEXT: store i32 [[TMP5]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK3-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]] +// CHECK3-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]] +// CHECK3: .termination.notifier: +// CHECK3-NEXT: call void @__kmpc_kernel_deinit(i16 1) +// CHECK3-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK3-NEXT: br label [[DOTEXIT]] +// CHECK3: .exit: +// CHECK3-NEXT: ret void +// +// +// CHECK3-LABEL: define {{[^@]+}}@__omp_outlined__ +// CHECK3-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { +// CHECK3-NEXT: entry: +// CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK3-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [1 x i8*], align 4 +// CHECK3-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK3-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK3-NEXT: [[TMP0:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared", align 2 +// CHECK3-NEXT: [[TMP1:%.*]] = load i32, i32* @"_openmp_static_kernel$size", align 4 +// CHECK3-NEXT: call void @__kmpc_get_team_static_memory(i16 0, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds (%"union._shared_openmp_static_memory_type_$_", %"union._shared_openmp_static_memory_type_$_" addrspace(3)* @"_openmp_shared_static_glob_rd_$_", i32 0, i32 0, i32 0) to i8*), i32 [[TMP1]], i16 [[TMP0]], i8** addrspacecast (i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr" to i8**)) +// CHECK3-NEXT: [[TMP2:%.*]] = load i8*, i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr", align 4 +// CHECK3-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, i8* [[TMP2]], i32 0 +// CHECK3-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty* +// CHECK3-NEXT: [[I:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP4]], i32 0, i32 0 +// CHECK3-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 +// CHECK3-NEXT: store i32 9, i32* [[DOTOMP_UB]], align 4 +// CHECK3-NEXT: store i32 1, i32* 
[[DOTOMP_STRIDE]], align 4 +// CHECK3-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK3-NEXT: [[TMP5:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK3-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4 +// CHECK3-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1:[0-9]+]], i32 [[TMP6]], i32 92, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) +// CHECK3-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK3-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP7]], 9 +// CHECK3-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK3: cond.true: +// CHECK3-NEXT: br label [[COND_END:%.*]] +// CHECK3: cond.false: +// CHECK3-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK3-NEXT: br label [[COND_END]] +// CHECK3: cond.end: +// CHECK3-NEXT: [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP8]], [[COND_FALSE]] ] +// CHECK3-NEXT: store i32 [[COND]], i32* [[DOTOMP_UB]], align 4 +// CHECK3-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK3-NEXT: store i32 [[TMP9]], i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK3: omp.inner.for.cond: +// CHECK3-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK3-NEXT: [[CMP1:%.*]] = icmp sle i32 [[TMP10]], [[TMP11]] +// CHECK3-NEXT: br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK3: omp.inner.for.body: +// CHECK3-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP12]], 1 +// CHECK3-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK3-NEXT: store i32 [[ADD]], i32* [[I]], align 4 +// CHECK3-NEXT: [[TMP13:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 +// CHECK3-NEXT: [[TMP14:%.*]] = bitcast i32* [[I]] to i8* +// CHECK3-NEXT: store i8* [[TMP14]], i8** [[TMP13]], align 4 +// CHECK3-NEXT: [[TMP15:%.*]] = bitcast [1 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK3-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB2]], i32 [[TMP6]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32*)* @__omp_outlined__1 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__1_wrapper to i8*), i8** [[TMP15]], i32 1) +// CHECK3-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK3: omp.body.continue: +// CHECK3-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK3: omp.inner.for.inc: +// CHECK3-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP16]], 1 +// CHECK3-NEXT: store i32 [[ADD2]], i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK3: omp.inner.for.end: +// CHECK3-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK3: omp.loop.exit: +// CHECK3-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP6]]) +// CHECK3-NEXT: [[TMP17:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared", align 2 +// CHECK3-NEXT: call void @__kmpc_restore_team_static_memory(i16 0, i16 [[TMP17]]) +// CHECK3-NEXT: ret void +// +// +// CHECK3-LABEL: define {{[^@]+}}@__omp_outlined__1 +// CHECK3-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[I:%.*]]) #[[ATTR1]] { +// CHECK3-NEXT: entry: +// CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// 
CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK3-NEXT: [[I_ADDR:%.*]] = alloca i32*, align 4 +// CHECK3-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK3-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK3-NEXT: store i32* [[I]], i32** [[I_ADDR]], align 4 +// CHECK3-NEXT: [[TMP0:%.*]] = load i32*, i32** [[I_ADDR]], align 4 +// CHECK3-NEXT: [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4 +// CHECK3-NEXT: [[INC:%.*]] = add nsw i32 [[TMP1]], 1 +// CHECK3-NEXT: store i32 [[INC]], i32* [[TMP0]], align 4 +// CHECK3-NEXT: ret void +// +// +// CHECK3-LABEL: define {{[^@]+}}@__omp_outlined__1_wrapper +// CHECK3-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { +// CHECK3-NEXT: entry: +// CHECK3-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 +// CHECK3-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 4 +// CHECK3-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK3-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 +// CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK3-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) +// CHECK3-NEXT: [[TMP2:%.*]] = load i8**, i8*** [[GLOBAL_ARGS]], align 4 +// CHECK3-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8*, i8** [[TMP2]], i32 0 +// CHECK3-NEXT: [[TMP4:%.*]] = bitcast i8** [[TMP3]] to i32** +// CHECK3-NEXT: [[TMP5:%.*]] = load i32*, i32** [[TMP4]], align 4 +// CHECK3-NEXT: call void @__omp_outlined__1(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]], i32* [[TMP5]]) #[[ATTR3]] +// CHECK3-NEXT: ret void +// +// +// CHECK4-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l19_worker +// CHECK4-SAME: () #[[ATTR0:[0-9]+]] { +// CHECK4-NEXT: entry: +// CHECK4-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 4 +// CHECK4-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1 +// CHECK4-NEXT: store i8* null, i8** [[WORK_FN]], align 4 +// CHECK4-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1 +// CHECK4-NEXT: br label [[DOTAWAIT_WORK:%.*]] +// CHECK4: .await.work: +// CHECK4-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK4-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]]) +// CHECK4-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8 +// CHECK4-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1 +// CHECK4-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 4 +// CHECK4-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null +// CHECK4-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]] +// CHECK4: .select.workers: +// CHECK4-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1 +// CHECK4-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0 +// CHECK4-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]] +// CHECK4: .execute.parallel: +// CHECK4-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2:[0-9]+]]) +// CHECK4-NEXT: [[TMP5:%.*]] = load i8*, i8** [[WORK_FN]], align 4 +// CHECK4-NEXT: [[WORK_MATCH:%.*]] = icmp eq i8* [[TMP5]], bitcast (void (i16, i32)* @__omp_outlined__1_wrapper to i8*) +// CHECK4-NEXT: br i1 [[WORK_MATCH]], label [[DOTEXECUTE_FN:%.*]], label [[DOTCHECK_NEXT:%.*]] +// CHECK4: .execute.fn: +// CHECK4-NEXT: call void @__omp_outlined__1_wrapper(i16 0, i32 [[TMP4]]) #[[ATTR3:[0-9]+]] +// CHECK4-NEXT: br label 
[[DOTTERMINATE_PARALLEL:%.*]] +// CHECK4: .check.next: +// CHECK4-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)* +// CHECK4-NEXT: call void [[TMP6]](i16 0, i32 [[TMP4]]) +// CHECK4-NEXT: br label [[DOTTERMINATE_PARALLEL]] +// CHECK4: .terminate.parallel: +// CHECK4-NEXT: call void @__kmpc_kernel_end_parallel() +// CHECK4-NEXT: br label [[DOTBARRIER_PARALLEL]] +// CHECK4: .barrier.parallel: +// CHECK4-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK4-NEXT: br label [[DOTAWAIT_WORK]] +// CHECK4: .exit: +// CHECK4-NEXT: ret void +// +// +// CHECK4-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l19 +// CHECK4-SAME: () #[[ATTR1:[0-9]+]] { +// CHECK4-NEXT: entry: +// CHECK4-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK4-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK4-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK4-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK4-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]] +// CHECK4-NEXT: [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]] +// CHECK4-NEXT: br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]] +// CHECK4: .worker: +// CHECK4-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l19_worker() #[[ATTR3]] +// CHECK4-NEXT: br label [[DOTEXIT:%.*]] +// CHECK4: .mastercheck: +// CHECK4-NEXT: [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK4-NEXT: [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK4-NEXT: [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK4-NEXT: [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1 +// CHECK4-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1 +// CHECK4-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], -1 +// CHECK4-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP2]], [[TMP3]] +// CHECK4-NEXT: [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]] +// CHECK4-NEXT: br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]] +// CHECK4: .master: +// CHECK4-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK4-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK4-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]] +// CHECK4-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1) +// CHECK4-NEXT: call void @__kmpc_data_sharing_init_stack() +// CHECK4-NEXT: [[TMP5:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]]) +// CHECK4-NEXT: store i32 [[TMP5]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK4-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]] +// CHECK4-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]] +// CHECK4: .termination.notifier: +// CHECK4-NEXT: call void @__kmpc_kernel_deinit(i16 1) +// CHECK4-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK4-NEXT: br label [[DOTEXIT]] +// CHECK4: .exit: +// CHECK4-NEXT: ret void +// +// +// CHECK4-LABEL: define {{[^@]+}}@__omp_outlined__ +// CHECK4-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { +// CHECK4-NEXT: entry: +// CHECK4-NEXT: 
[[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK4-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK4-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [1 x i8*], align 4 +// CHECK4-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK4-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK4-NEXT: [[TMP0:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared", align 2 +// CHECK4-NEXT: [[TMP1:%.*]] = load i32, i32* @"_openmp_static_kernel$size", align 4 +// CHECK4-NEXT: call void @__kmpc_get_team_static_memory(i16 0, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds (%"union._shared_openmp_static_memory_type_$_", %"union._shared_openmp_static_memory_type_$_" addrspace(3)* @"_openmp_shared_static_glob_rd_$_", i32 0, i32 0, i32 0) to i8*), i32 [[TMP1]], i16 [[TMP0]], i8** addrspacecast (i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr" to i8**)) +// CHECK4-NEXT: [[TMP2:%.*]] = load i8*, i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr", align 4 +// CHECK4-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, i8* [[TMP2]], i32 0 +// CHECK4-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty* +// CHECK4-NEXT: [[I:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP4]], i32 0, i32 0 +// CHECK4-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 +// CHECK4-NEXT: store i32 9, i32* [[DOTOMP_UB]], align 4 +// CHECK4-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK4-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK4-NEXT: [[TMP5:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK4-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4 +// CHECK4-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1:[0-9]+]], i32 [[TMP6]], i32 92, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) +// CHECK4-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK4-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP7]], 9 +// CHECK4-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK4: cond.true: +// CHECK4-NEXT: br label [[COND_END:%.*]] +// CHECK4: cond.false: +// CHECK4-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK4-NEXT: br label [[COND_END]] +// CHECK4: cond.end: +// CHECK4-NEXT: [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP8]], [[COND_FALSE]] ] +// CHECK4-NEXT: store i32 [[COND]], i32* [[DOTOMP_UB]], align 4 +// CHECK4-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK4-NEXT: store i32 [[TMP9]], i32* [[DOTOMP_IV]], align 4 +// CHECK4-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK4: omp.inner.for.cond: +// CHECK4-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK4-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK4-NEXT: [[CMP1:%.*]] = icmp sle i32 [[TMP10]], [[TMP11]] +// CHECK4-NEXT: br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK4: omp.inner.for.body: +// CHECK4-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK4-NEXT: 
[[MUL:%.*]] = mul nsw i32 [[TMP12]], 1 +// CHECK4-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK4-NEXT: store i32 [[ADD]], i32* [[I]], align 4 +// CHECK4-NEXT: [[TMP13:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 +// CHECK4-NEXT: [[TMP14:%.*]] = bitcast i32* [[I]] to i8* +// CHECK4-NEXT: store i8* [[TMP14]], i8** [[TMP13]], align 4 +// CHECK4-NEXT: [[TMP15:%.*]] = bitcast [1 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK4-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB2]], i32 [[TMP6]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32*)* @__omp_outlined__1 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__1_wrapper to i8*), i8** [[TMP15]], i32 1) +// CHECK4-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK4: omp.body.continue: +// CHECK4-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK4: omp.inner.for.inc: +// CHECK4-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK4-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP16]], 1 +// CHECK4-NEXT: store i32 [[ADD2]], i32* [[DOTOMP_IV]], align 4 +// CHECK4-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK4: omp.inner.for.end: +// CHECK4-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK4: omp.loop.exit: +// CHECK4-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP6]]) +// CHECK4-NEXT: [[TMP17:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared", align 2 +// CHECK4-NEXT: call void @__kmpc_restore_team_static_memory(i16 0, i16 [[TMP17]]) +// CHECK4-NEXT: ret void +// +// +// CHECK4-LABEL: define {{[^@]+}}@__omp_outlined__1 +// CHECK4-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[I:%.*]]) #[[ATTR1]] { +// CHECK4-NEXT: entry: +// CHECK4-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK4-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK4-NEXT: [[I_ADDR:%.*]] = alloca i32*, align 4 +// CHECK4-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK4-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK4-NEXT: store i32* [[I]], i32** [[I_ADDR]], align 4 +// CHECK4-NEXT: [[TMP0:%.*]] = load i32*, i32** [[I_ADDR]], align 4 +// CHECK4-NEXT: [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4 +// CHECK4-NEXT: [[INC:%.*]] = add nsw i32 [[TMP1]], 1 +// CHECK4-NEXT: store i32 [[INC]], i32* [[TMP0]], align 4 +// CHECK4-NEXT: ret void +// +// +// CHECK4-LABEL: define {{[^@]+}}@__omp_outlined__1_wrapper +// CHECK4-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { +// CHECK4-NEXT: entry: +// CHECK4-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 +// CHECK4-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 4 +// CHECK4-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK4-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 +// CHECK4-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK4-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) +// CHECK4-NEXT: [[TMP2:%.*]] = load i8**, i8*** [[GLOBAL_ARGS]], align 4 +// CHECK4-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8*, i8** [[TMP2]], i32 0 +// CHECK4-NEXT: [[TMP4:%.*]] = bitcast i8** [[TMP3]] to i32** +// CHECK4-NEXT: [[TMP5:%.*]] = load i32*, i32** [[TMP4]], align 4 +// CHECK4-NEXT: call void @__omp_outlined__1(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]], i32* [[TMP5]]) #[[ATTR3]] +// 
CHECK4-NEXT: ret void +// +// +// CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l19_worker +// CHECK5-SAME: () #[[ATTR0:[0-9]+]] { +// CHECK5-NEXT: entry: +// CHECK5-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 4 +// CHECK5-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1 +// CHECK5-NEXT: store i8* null, i8** [[WORK_FN]], align 4 +// CHECK5-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1 +// CHECK5-NEXT: br label [[DOTAWAIT_WORK:%.*]] +// CHECK5: .await.work: +// CHECK5-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK5-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]]) +// CHECK5-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8 +// CHECK5-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1 +// CHECK5-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 4 +// CHECK5-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null +// CHECK5-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]] +// CHECK5: .select.workers: +// CHECK5-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1 +// CHECK5-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0 +// CHECK5-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]] +// CHECK5: .execute.parallel: +// CHECK5-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2:[0-9]+]]) +// CHECK5-NEXT: [[TMP5:%.*]] = load i8*, i8** [[WORK_FN]], align 4 +// CHECK5-NEXT: [[WORK_MATCH:%.*]] = icmp eq i8* [[TMP5]], bitcast (void (i16, i32)* @__omp_outlined__1_wrapper to i8*) +// CHECK5-NEXT: br i1 [[WORK_MATCH]], label [[DOTEXECUTE_FN:%.*]], label [[DOTCHECK_NEXT:%.*]] +// CHECK5: .execute.fn: +// CHECK5-NEXT: call void @__omp_outlined__1_wrapper(i16 0, i32 [[TMP4]]) #[[ATTR3:[0-9]+]] +// CHECK5-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]] +// CHECK5: .check.next: +// CHECK5-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)* +// CHECK5-NEXT: call void [[TMP6]](i16 0, i32 [[TMP4]]) +// CHECK5-NEXT: br label [[DOTTERMINATE_PARALLEL]] +// CHECK5: .terminate.parallel: +// CHECK5-NEXT: call void @__kmpc_kernel_end_parallel() +// CHECK5-NEXT: br label [[DOTBARRIER_PARALLEL]] +// CHECK5: .barrier.parallel: +// CHECK5-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK5-NEXT: br label [[DOTAWAIT_WORK]] +// CHECK5: .exit: +// CHECK5-NEXT: ret void +// +// +// CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l19 +// CHECK5-SAME: () #[[ATTR1:[0-9]+]] { +// CHECK5-NEXT: entry: +// CHECK5-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK5-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK5-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK5-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK5-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]] +// CHECK5-NEXT: [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]] +// CHECK5-NEXT: br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]] +// CHECK5: .worker: +// CHECK5-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l19_worker() #[[ATTR3]] +// CHECK5-NEXT: br label [[DOTEXIT:%.*]] +// CHECK5: .mastercheck: +// CHECK5-NEXT: [[NVPTX_TID1:%.*]] = call 
i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK5-NEXT: [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK5-NEXT: [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK5-NEXT: [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1 +// CHECK5-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1 +// CHECK5-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], -1 +// CHECK5-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP2]], [[TMP3]] +// CHECK5-NEXT: [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]] +// CHECK5-NEXT: br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]] +// CHECK5: .master: +// CHECK5-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK5-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK5-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]] +// CHECK5-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1) +// CHECK5-NEXT: call void @__kmpc_data_sharing_init_stack() +// CHECK5-NEXT: [[TMP5:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]]) +// CHECK5-NEXT: store i32 [[TMP5]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK5-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]] +// CHECK5-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]] +// CHECK5: .termination.notifier: +// CHECK5-NEXT: call void @__kmpc_kernel_deinit(i16 1) +// CHECK5-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK5-NEXT: br label [[DOTEXIT]] +// CHECK5: .exit: +// CHECK5-NEXT: ret void +// +// +// CHECK5-LABEL: define {{[^@]+}}@__omp_outlined__ +// CHECK5-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { +// CHECK5-NEXT: entry: +// CHECK5-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK5-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK5-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [1 x i8*], align 4 +// CHECK5-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK5-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK5-NEXT: [[TMP0:%.*]] = call i8* @__kmpc_data_sharing_push_stack(i32 4, i16 1) +// CHECK5-NEXT: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to %struct._globalized_locals_ty* +// CHECK5-NEXT: [[I:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP1]], i32 0, i32 0 +// CHECK5-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 +// CHECK5-NEXT: store i32 9, i32* [[DOTOMP_UB]], align 4 +// CHECK5-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK5-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK5-NEXT: [[TMP2:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK5-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP2]], align 4 +// CHECK5-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1:[0-9]+]], i32 [[TMP3]], i32 92, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) +// CHECK5-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 
+// CHECK5-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP4]], 9 +// CHECK5-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK5: cond.true: +// CHECK5-NEXT: br label [[COND_END:%.*]] +// CHECK5: cond.false: +// CHECK5-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK5-NEXT: br label [[COND_END]] +// CHECK5: cond.end: +// CHECK5-NEXT: [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP5]], [[COND_FALSE]] ] +// CHECK5-NEXT: store i32 [[COND]], i32* [[DOTOMP_UB]], align 4 +// CHECK5-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK5-NEXT: store i32 [[TMP6]], i32* [[DOTOMP_IV]], align 4 +// CHECK5-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK5: omp.inner.for.cond: +// CHECK5-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK5-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK5-NEXT: [[CMP1:%.*]] = icmp sle i32 [[TMP7]], [[TMP8]] +// CHECK5-NEXT: br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK5: omp.inner.for.body: +// CHECK5-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK5-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP9]], 1 +// CHECK5-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK5-NEXT: store i32 [[ADD]], i32* [[I]], align 4 +// CHECK5-NEXT: [[TMP10:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 +// CHECK5-NEXT: [[TMP11:%.*]] = bitcast i32* [[I]] to i8* +// CHECK5-NEXT: store i8* [[TMP11]], i8** [[TMP10]], align 4 +// CHECK5-NEXT: [[TMP12:%.*]] = bitcast [1 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK5-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB2]], i32 [[TMP3]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32*)* @__omp_outlined__1 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__1_wrapper to i8*), i8** [[TMP12]], i32 1) +// CHECK5-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK5: omp.body.continue: +// CHECK5-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK5: omp.inner.for.inc: +// CHECK5-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK5-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP13]], 1 +// CHECK5-NEXT: store i32 [[ADD2]], i32* [[DOTOMP_IV]], align 4 +// CHECK5-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK5: omp.inner.for.end: +// CHECK5-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK5: omp.loop.exit: +// CHECK5-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP3]]) +// CHECK5-NEXT: call void @__kmpc_data_sharing_pop_stack(i8* [[TMP0]]) +// CHECK5-NEXT: ret void +// +// +// CHECK5-LABEL: define {{[^@]+}}@__omp_outlined__1 +// CHECK5-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[I:%.*]]) #[[ATTR1]] { +// CHECK5-NEXT: entry: +// CHECK5-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK5-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK5-NEXT: [[I_ADDR:%.*]] = alloca i32*, align 4 +// CHECK5-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK5-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK5-NEXT: store i32* [[I]], i32** [[I_ADDR]], align 4 +// CHECK5-NEXT: [[TMP0:%.*]] = load i32*, i32** [[I_ADDR]], align 4 +// CHECK5-NEXT: [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4 +// CHECK5-NEXT: [[INC:%.*]] = add nsw i32 [[TMP1]], 1 +// CHECK5-NEXT: store i32 [[INC]], i32* [[TMP0]], align 4 +// CHECK5-NEXT: ret void +// +// +// 
CHECK5-LABEL: define {{[^@]+}}@__omp_outlined__1_wrapper +// CHECK5-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { +// CHECK5-NEXT: entry: +// CHECK5-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 +// CHECK5-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 4 +// CHECK5-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK5-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 +// CHECK5-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK5-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) +// CHECK5-NEXT: [[TMP2:%.*]] = load i8**, i8*** [[GLOBAL_ARGS]], align 4 +// CHECK5-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8*, i8** [[TMP2]], i32 0 +// CHECK5-NEXT: [[TMP4:%.*]] = bitcast i8** [[TMP3]] to i32** +// CHECK5-NEXT: [[TMP5:%.*]] = load i32*, i32** [[TMP4]], align 4 +// CHECK5-NEXT: call void @__omp_outlined__1(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]], i32* [[TMP5]]) #[[ATTR3]] +// CHECK5-NEXT: ret void +// +// +// CHECK6-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l19_worker +// CHECK6-SAME: () #[[ATTR0:[0-9]+]] { +// CHECK6-NEXT: entry: +// CHECK6-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 4 +// CHECK6-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1 +// CHECK6-NEXT: store i8* null, i8** [[WORK_FN]], align 4 +// CHECK6-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1 +// CHECK6-NEXT: br label [[DOTAWAIT_WORK:%.*]] +// CHECK6: .await.work: +// CHECK6-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK6-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]]) +// CHECK6-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8 +// CHECK6-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1 +// CHECK6-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 4 +// CHECK6-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null +// CHECK6-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]] +// CHECK6: .select.workers: +// CHECK6-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1 +// CHECK6-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0 +// CHECK6-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]] +// CHECK6: .execute.parallel: +// CHECK6-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2:[0-9]+]]) +// CHECK6-NEXT: [[TMP5:%.*]] = load i8*, i8** [[WORK_FN]], align 4 +// CHECK6-NEXT: [[WORK_MATCH:%.*]] = icmp eq i8* [[TMP5]], bitcast (void (i16, i32)* @__omp_outlined__1_wrapper to i8*) +// CHECK6-NEXT: br i1 [[WORK_MATCH]], label [[DOTEXECUTE_FN:%.*]], label [[DOTCHECK_NEXT:%.*]] +// CHECK6: .execute.fn: +// CHECK6-NEXT: call void @__omp_outlined__1_wrapper(i16 0, i32 [[TMP4]]) #[[ATTR3:[0-9]+]] +// CHECK6-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]] +// CHECK6: .check.next: +// CHECK6-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)* +// CHECK6-NEXT: call void [[TMP6]](i16 0, i32 [[TMP4]]) +// CHECK6-NEXT: br label [[DOTTERMINATE_PARALLEL]] +// CHECK6: .terminate.parallel: +// CHECK6-NEXT: call void @__kmpc_kernel_end_parallel() +// CHECK6-NEXT: br label [[DOTBARRIER_PARALLEL]] +// CHECK6: .barrier.parallel: +// CHECK6-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK6-NEXT: br label [[DOTAWAIT_WORK]] +// CHECK6: .exit: +// CHECK6-NEXT: ret void +// +// +// CHECK6-LABEL: define 
{{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l19 +// CHECK6-SAME: () #[[ATTR1:[0-9]+]] { +// CHECK6-NEXT: entry: +// CHECK6-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK6-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK6-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK6-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK6-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]] +// CHECK6-NEXT: [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]] +// CHECK6-NEXT: br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]] +// CHECK6: .worker: +// CHECK6-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l19_worker() #[[ATTR3]] +// CHECK6-NEXT: br label [[DOTEXIT:%.*]] +// CHECK6: .mastercheck: +// CHECK6-NEXT: [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK6-NEXT: [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK6-NEXT: [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK6-NEXT: [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1 +// CHECK6-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1 +// CHECK6-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], -1 +// CHECK6-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP2]], [[TMP3]] +// CHECK6-NEXT: [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]] +// CHECK6-NEXT: br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]] +// CHECK6: .master: +// CHECK6-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK6-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK6-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]] +// CHECK6-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1) +// CHECK6-NEXT: call void @__kmpc_data_sharing_init_stack() +// CHECK6-NEXT: [[TMP5:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]]) +// CHECK6-NEXT: store i32 [[TMP5]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK6-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]] +// CHECK6-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]] +// CHECK6: .termination.notifier: +// CHECK6-NEXT: call void @__kmpc_kernel_deinit(i16 1) +// CHECK6-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK6-NEXT: br label [[DOTEXIT]] +// CHECK6: .exit: +// CHECK6-NEXT: ret void +// +// +// CHECK6-LABEL: define {{[^@]+}}@__omp_outlined__ +// CHECK6-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { +// CHECK6-NEXT: entry: +// CHECK6-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK6-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK6-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [1 x i8*], align 4 +// CHECK6-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], 
align 4 +// CHECK6-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK6-NEXT: [[TMP0:%.*]] = call i8* @__kmpc_data_sharing_push_stack(i32 4, i16 1) +// CHECK6-NEXT: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to %struct._globalized_locals_ty* +// CHECK6-NEXT: [[I:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP1]], i32 0, i32 0 +// CHECK6-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 +// CHECK6-NEXT: store i32 9, i32* [[DOTOMP_UB]], align 4 +// CHECK6-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK6-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK6-NEXT: [[TMP2:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK6-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP2]], align 4 +// CHECK6-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1:[0-9]+]], i32 [[TMP3]], i32 92, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) +// CHECK6-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK6-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP4]], 9 +// CHECK6-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK6: cond.true: +// CHECK6-NEXT: br label [[COND_END:%.*]] +// CHECK6: cond.false: +// CHECK6-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK6-NEXT: br label [[COND_END]] +// CHECK6: cond.end: +// CHECK6-NEXT: [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP5]], [[COND_FALSE]] ] +// CHECK6-NEXT: store i32 [[COND]], i32* [[DOTOMP_UB]], align 4 +// CHECK6-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK6-NEXT: store i32 [[TMP6]], i32* [[DOTOMP_IV]], align 4 +// CHECK6-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK6: omp.inner.for.cond: +// CHECK6-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK6-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK6-NEXT: [[CMP1:%.*]] = icmp sle i32 [[TMP7]], [[TMP8]] +// CHECK6-NEXT: br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK6: omp.inner.for.body: +// CHECK6-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK6-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP9]], 1 +// CHECK6-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK6-NEXT: store i32 [[ADD]], i32* [[I]], align 4 +// CHECK6-NEXT: [[TMP10:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 +// CHECK6-NEXT: [[TMP11:%.*]] = bitcast i32* [[I]] to i8* +// CHECK6-NEXT: store i8* [[TMP11]], i8** [[TMP10]], align 4 +// CHECK6-NEXT: [[TMP12:%.*]] = bitcast [1 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK6-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB2]], i32 [[TMP3]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32*)* @__omp_outlined__1 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__1_wrapper to i8*), i8** [[TMP12]], i32 1) +// CHECK6-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK6: omp.body.continue: +// CHECK6-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK6: omp.inner.for.inc: +// CHECK6-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK6-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP13]], 1 +// CHECK6-NEXT: store i32 [[ADD2]], i32* [[DOTOMP_IV]], align 4 +// CHECK6-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK6: omp.inner.for.end: +// CHECK6-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK6: omp.loop.exit: +// CHECK6-NEXT: call void 
@__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP3]]) +// CHECK6-NEXT: call void @__kmpc_data_sharing_pop_stack(i8* [[TMP0]]) +// CHECK6-NEXT: ret void +// +// +// CHECK6-LABEL: define {{[^@]+}}@__omp_outlined__1 +// CHECK6-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[I:%.*]]) #[[ATTR1]] { +// CHECK6-NEXT: entry: +// CHECK6-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK6-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK6-NEXT: [[I_ADDR:%.*]] = alloca i32*, align 4 +// CHECK6-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK6-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK6-NEXT: store i32* [[I]], i32** [[I_ADDR]], align 4 +// CHECK6-NEXT: [[TMP0:%.*]] = load i32*, i32** [[I_ADDR]], align 4 +// CHECK6-NEXT: [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4 +// CHECK6-NEXT: [[INC:%.*]] = add nsw i32 [[TMP1]], 1 +// CHECK6-NEXT: store i32 [[INC]], i32* [[TMP0]], align 4 +// CHECK6-NEXT: ret void +// +// +// CHECK6-LABEL: define {{[^@]+}}@__omp_outlined__1_wrapper +// CHECK6-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { +// CHECK6-NEXT: entry: +// CHECK6-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 +// CHECK6-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 4 +// CHECK6-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK6-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 +// CHECK6-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK6-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) +// CHECK6-NEXT: [[TMP2:%.*]] = load i8**, i8*** [[GLOBAL_ARGS]], align 4 +// CHECK6-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8*, i8** [[TMP2]], i32 0 +// CHECK6-NEXT: [[TMP4:%.*]] = bitcast i8** [[TMP3]] to i32** +// CHECK6-NEXT: [[TMP5:%.*]] = load i32*, i32** [[TMP4]], align 4 +// CHECK6-NEXT: call void @__omp_outlined__1(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]], i32* [[TMP5]]) #[[ATTR3]] +// CHECK6-NEXT: ret void +// diff --git a/clang/test/OpenMP/nvptx_target_teams_distribute_parallel_for_codegen.cpp b/clang/test/OpenMP/nvptx_target_teams_distribute_parallel_for_codegen.cpp --- a/clang/test/OpenMP/nvptx_target_teams_distribute_parallel_for_codegen.cpp +++ b/clang/test/OpenMP/nvptx_target_teams_distribute_parallel_for_codegen.cpp @@ -1,38 +1,31 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature --include-generated-funcs --replace-function-regex "__omp_offloading_[0-9a-z]+_[0-9a-z]+" --prefix-filecheck-ir-name _ // Test target codegen - host bc file has to be created first. 
// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc -// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64 --check-prefix CHECK-DIV64 --check-prefix SEQ -// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64 --check-prefix CHECK-DIV64 --check-prefix PAR -// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -fopenmp-optimistic-collapse -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-DIV32 --check-prefix SEQ -// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -fopenmp-optimistic-collapse -o - -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix CHECK --check-prefix CHECK-DIV32 --check-prefix PAR +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix CHECK +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix CHECK1 +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -fopenmp-optimistic-collapse -o - | FileCheck %s --check-prefix CHECK2 +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -fopenmp-optimistic-collapse -o - -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix CHECK3 // RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -x c++ -triple i386-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm-bc %s -o %t-x86-host.bc -// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 --check-prefix SEQ -// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device 
-fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 --check-prefix SEQ -// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 --check-prefix PAR -// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 --check-prefix PAR +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK4 +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK5 +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix CHECK6 +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix CHECK7 // RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc -// RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64 --check-prefix CHECK-DIV64 --check-prefix SEQ -// RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64 --check-prefix CHECK-DIV64 --check-prefix PAR -// RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -fopenmp-optimistic-collapse -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-DIV32 --check-prefix SEQ -// RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -fopenmp-optimistic-collapse -o - -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix CHECK --check-prefix CHECK-DIV32 
--check-prefix PAR +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix CHECK8 +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix CHECK9 +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -fopenmp-optimistic-collapse -o - | FileCheck %s --check-prefix CHECK10 +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -fopenmp-optimistic-collapse -o - -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix CHECK11 // RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -x c++ -triple i386-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm-bc %s -o %t-x86-host.bc -// RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 --check-prefix SEQ -// RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 --check-prefix SEQ -// RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 --check-prefix PAR -// RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 --check-prefix PAR +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK12 +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK13 +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix CHECK14 +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown 
-fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix CHECK15 // expected-no-diagnostics #ifndef HEADER #define HEADER -// Check that the execution mode of all 5 target regions on the gpu is set to SPMD Mode. -// CHECK-DAG: {{@__omp_offloading_.+l50}}_exec_mode = weak constant i8 0 -// CHECK-DAG: {{@__omp_offloading_.+l56}}_exec_mode = weak constant i8 0 -// CHECK-DAG: {{@__omp_offloading_.+l61}}_exec_mode = weak constant i8 0 -// CHECK-DAG: {{@__omp_offloading_.+l66}}_exec_mode = weak constant i8 0 -// CHECK-DAG: {{@__omp_offloading_.+l74}}_exec_mode = weak constant i8 0 -// CHECK-DAG: {{@__omp_offloading_.+l81}}_exec_mode = weak constant i8 0 - #define N 1000 #define M 10 @@ -92,173 +85,25263 @@ return a; } -// SEQ-DAG: [[MEM_TY:%.+]] = type { [128 x i8] } -// SEQ-DAG: [[SHARED_GLOBAL_RD:@.+]] = weak addrspace(3) global [[MEM_TY]] undef -// SEQ-DAG: [[KERNEL_PTR:@.+]] = internal addrspace(3) global i8* undef -// SEQ-DAG: [[KERNEL_SIZE:@.+]] = internal unnamed_addr constant i{{64|32}} 4 -// SEQ-DAG: [[KERNEL_SHARED:@.+]] = internal unnamed_addr constant i16 1 - -// CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+}}_l50( -// CHECK-DAG: [[THREAD_LIMIT:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK: call void @__kmpc_spmd_kernel_init(i32 [[THREAD_LIMIT]], i16 0) -// CHECK: call void [[PARALLEL:@.+]]( -// CHECK: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) - -// CHECK: define internal void [[PARALLEL]]( -// SEQ: [[SHARED:%.+]] = load i16, i16* [[KERNEL_SHARED]], -// SEQ: [[SIZE:%.+]] = load i{{64|32}}, i{{64|32}}* [[KERNEL_SIZE]], -// SEQ: call void @__kmpc_get_team_static_memory(i16 1, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds ([[MEM_TY]], [[MEM_TY]] addrspace(3)* [[SHARED_GLOBAL_RD]], i32 0, i32 0, i32 0) to i8*), i{{64|32}} [[SIZE]], i16 [[SHARED]], i8** addrspacecast (i8* addrspace(3)* [[KERNEL_PTR]] to i8**)) -// SEQ: [[TEAM_ALLOC:%.+]] = load i8*, i8* addrspace(3)* [[KERNEL_PTR]], -// SEQ: [[ADDR:%.+]] = getelementptr inbounds i8, i8* [[TEAM_ALLOC]], i{{64|32}} 0 -// PAR: [[ADDR:%.+]] = call i8* @__kmpc_data_sharing_push_stack(i{{32|64}} 4, i16 1) -// CHECK: [[BC:%.+]] = bitcast i8* [[ADDR]] to [[REC:%.+]]* -// CHECK: getelementptr inbounds [[REC]], [[REC]]* [[BC]], i{{[0-9]+}} 0, i{{[0-9]+}} 0 -// CHECK: call void @__kmpc_for_static_init_4({{.+}}, {{.+}}, {{.+}} 91, -// CHECK: {{call|invoke}} void [[OUTL1:@.+]]( -// CHECK: call void @__kmpc_for_static_fini( -// SEQ: [[SHARED:%.+]] = load i16, i16* [[KERNEL_SHARED]], -// SEQ: call void @__kmpc_restore_team_static_memory(i16 1, i16 [[SHARED]]) -// PAR: call void @__kmpc_data_sharing_pop_stack(i8* [[ADDR]]) -// CHECK: ret void - -// CHECK: define internal void [[OUTL1]]( -// CHECK: call void @__kmpc_for_static_init_4({{.+}}, {{.+}}, {{.+}} 33, -// CHECK: call void @__kmpc_for_static_fini( -// CHECK: ret void - -// CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+}}( -// CHECK-DAG: [[THREAD_LIMIT:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK: call void @__kmpc_spmd_kernel_init(i32 [[THREAD_LIMIT]], i16 0) -// CHECK: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) -// CHECK: ret void - -// CHECK: call void @__kmpc_for_static_init_4({{.+}}, {{.+}}, {{.+}} 91, -// CHECK: {{call|invoke}} void [[OUTL2:@.+]]( -// CHECK: call void @__kmpc_for_static_fini( - -// CHECK: define internal void [[OUTL2]]( -// CHECK: call void @__kmpc_for_static_init_4({{.+}}, {{.+}}, 
{{.+}} 33, -// CHECK: call void @__kmpc_for_static_fini( -// CHECK: ret void - -// CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+}}( -// CHECK-DAG: [[THREAD_LIMIT:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK: call void @__kmpc_spmd_kernel_init(i32 [[THREAD_LIMIT]], i16 0) -// CHECK: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) -// CHECK: ret void - -// CHECK: call void @__kmpc_for_static_init_4({{.+}}, {{.+}}, {{.+}} 91, -// CHECK: {{call|invoke}} void [[OUTL3:@.+]]( -// CHECK: call void @__kmpc_for_static_fini( - -// CHECK: define internal void [[OUTL3]]( -// CHECK: call void @__kmpc_for_static_init_4({{.+}}, {{.+}}, {{.+}} 33, -// CHECK: call void @__kmpc_for_static_fini( -// CHECK: ret void - -// Distribute with collapse(2) -// CHECK: define {{.*}}void {{@__omp_offloading_.+}}({{.+}}, i{{32|64}} [[F_IN:%.+]]) -// CHECK-DAG: [[THREAD_LIMIT:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK: call void @__kmpc_spmd_kernel_init(i32 [[THREAD_LIMIT]], i16 0) -// CHECK: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) -// CHECK: ret void - -// CHECK: alloca -// CHECK: alloca -// CHECK: alloca -// CHECK: alloca -// CHECK: [[OMP_IV:%.+]] = alloca -// CHECK: alloca -// CHECK: alloca -// CHECK: [[OMP_LB:%.+]] = alloca -// CHECK: [[OMP_UB:%.+]] = alloca -// CHECK: [[OMP_ST:%.+]] = alloca -// CHECK: store {{.+}} [[F_IN]], {{.+}}* {{.+}}, -// CHECK: store {{.+}} 99, {{.+}}* [[COMB_UB:%.+]], align -// CHECK: call void @__kmpc_for_static_init_4({{.+}}, {{.+}}, {{.+}} 91, {{.+}}, {{.+}}, {{.+}}* [[COMB_UB]], - -// check EUB for distribute -// CHECK-DAG: [[OMP_UB_VAL_1:%.+]] = load{{.+}} [[OMP_UB]], -// CHECK-DAG: [[CMP_UB_NUM_IT:%.+]] = icmp sgt {{.+}} [[OMP_UB_VAL_1]], 99 -// CHECK: br {{.+}} [[CMP_UB_NUM_IT]], label %[[EUB_TRUE:.+]], label %[[EUB_FALSE:.+]] -// CHECK-DAG: [[EUB_TRUE]]: -// CHECK: br label %[[EUB_END:.+]] -// CHECK-DAG: [[EUB_FALSE]]: -// CHECK: [[OMP_UB_VAL2:%.+]] = load{{.+}} [[OMP_UB]], -// CHECK: br label %[[EUB_END]] -// CHECK-DAG: [[EUB_END]]: -// CHECK-DAG: [[EUB_RES:%.+]] = phi{{.+}} [ 99, %[[EUB_TRUE]] ], [ [[OMP_UB_VAL2]], %[[EUB_FALSE]] ] -// CHECK: store{{.+}} [[EUB_RES]], {{.+}}* [[OMP_UB]], - -// initialize omp.iv -// CHECK: [[OMP_LB_VAL_1:%.+]] = load{{.+}}, {{.+}}* [[OMP_LB]], -// CHECK: store {{.+}} [[OMP_LB_VAL_1]], {{.+}}* [[OMP_IV]], - -// check exit condition -// CHECK-DAG: [[OMP_IV_VAL_1:%.+]] = load {{.+}} [[OMP_IV]], -// CHECK: [[CMP_IV_UB:%.+]] = icmp slt {{.+}} [[OMP_IV_VAL_1]], 100 -// CHECK: br {{.+}} [[CMP_IV_UB]], label %[[DIST_INNER_LOOP_BODY:.+]], label %[[DIST_INNER_LOOP_END:.+]] - -// check that PrevLB and PrevUB are passed to the 'for' -// CHECK: [[DIST_INNER_LOOP_BODY]]: -// CHECK-DAG: [[OMP_PREV_LB:%.+]] = load {{.+}}, {{.+}} [[OMP_LB]], -// CHECK-64-DAG: [[OMP_PREV_LB_EXT:%.+]] = zext {{.+}} [[OMP_PREV_LB]] to {{.+}} -// CHECK-DAG: [[OMP_PREV_UB:%.+]] = load {{.+}}, {{.+}} [[OMP_UB]], -// CHECK-64-DAG: [[OMP_PREV_UB_EXT:%.+]] = zext {{.+}} [[OMP_PREV_UB]] to {{.+}} - -// check that distlb and distub are properly passed to the outlined function -// CHECK-32: {{call|invoke}} void [[OUTL4:@.+]]({{.*}} i32 [[OMP_PREV_LB]], i32 [[OMP_PREV_UB]] -// CHECK-64: {{call|invoke}} void [[OUTL4:@.+]]({{.*}} i64 [[OMP_PREV_LB_EXT]], i64 [[OMP_PREV_UB_EXT]] - -// check DistInc -// CHECK-DAG: [[OMP_IV_VAL_3:%.+]] = load {{.+}}, {{.+}}* [[OMP_IV]], -// CHECK-DAG: [[OMP_ST_VAL_1:%.+]] = load {{.+}}, {{.+}}* [[OMP_ST]], -// CHECK: [[OMP_IV_INC:%.+]] = add{{.+}} [[OMP_IV_VAL_3]], [[OMP_ST_VAL_1]] -// CHECK: store{{.+}} [[OMP_IV_INC]], 
{{.+}}* [[OMP_IV]], -// CHECK-DAG: [[OMP_LB_VAL_2:%.+]] = load{{.+}}, {{.+}} [[OMP_LB]], -// CHECK-DAG: [[OMP_ST_VAL_2:%.+]] = load{{.+}}, {{.+}} [[OMP_ST]], -// CHECK-DAG: [[OMP_LB_NEXT:%.+]] = add{{.+}} [[OMP_LB_VAL_2]], [[OMP_ST_VAL_2]] -// CHECK: store{{.+}} [[OMP_LB_NEXT]], {{.+}}* [[OMP_LB]], -// CHECK-DAG: [[OMP_UB_VAL_5:%.+]] = load{{.+}}, {{.+}} [[OMP_UB]], -// CHECK-DAG: [[OMP_ST_VAL_3:%.+]] = load{{.+}}, {{.+}} [[OMP_ST]], -// CHECK-DAG: [[OMP_UB_NEXT:%.+]] = add{{.+}} [[OMP_UB_VAL_5]], [[OMP_ST_VAL_3]] -// CHECK: store{{.+}} [[OMP_UB_NEXT]], {{.+}}* [[OMP_UB]], - -// Update UB -// CHECK-DAG: [[OMP_UB_VAL_6:%.+]] = load{{.+}}, {{.+}} [[OMP_UB]], -// CHECK-DAG: [[CMP_UB_NUM_IT_1:%.+]] = icmp sgt {{.+}}[[OMP_UB_VAL_6]], 99 -// CHECK: br {{.+}} [[CMP_UB_NUM_IT_1]], label %[[EUB_TRUE_1:.+]], label %[[EUB_FALSE_1:.+]] -// CHECK-DAG: [[EUB_TRUE_1]]: -// CHECK: br label %[[EUB_END_1:.+]] -// CHECK-DAG: [[EUB_FALSE_1]]: -// CHECK: [[OMP_UB_VAL3:%.+]] = load{{.+}} [[OMP_UB]], -// CHECK: br label %[[EUB_END_1]] -// CHECK-DAG: [[EUB_END_1]]: -// CHECK-DAG: [[EUB_RES_1:%.+]] = phi{{.+}} [ 99, %[[EUB_TRUE_1]] ], [ [[OMP_UB_VAL3]], %[[EUB_FALSE_1]] ] -// CHECK: store{{.+}} [[EUB_RES_1]], {{.+}}* [[OMP_UB]], - -// Store LB in IV -// CHECK-DAG: [[OMP_LB_VAL_3:%.+]] = load{{.+}}, {{.+}} [[OMP_LB]], -// CHECK: store{{.+}} [[OMP_LB_VAL_3]], {{.+}}* [[OMP_IV]], - -// CHECK: [[DIST_INNER_LOOP_END]]: -// CHECK: call void @__kmpc_for_static_fini( - - -// CHECK-32: define internal void [[OUTL4]]( -// CHECK-64: define internal void [[OUTL4]]( -// CHECK: call void @__kmpc_for_static_init_4({{.+}}, {{.+}}, {{.+}} 33, -// CHECK: call void @__kmpc_for_static_fini( -// CHECK: ret void - -// CHECK: define weak void @__omp_offloading_{{.*}}_l74(i[[SZ:64|32]] %{{[^,]+}}, [10 x [10 x i32]]* nonnull align {{[0-9]+}} dereferenceable{{.*}}) -// CHECK: call void [[OUTLINED:@__omp_outlined.*]](i32* %{{.+}}, i32* %{{.+}}, i[[SZ]] %{{.*}}, i[[SZ]] %{{.*}}, i[[SZ]] %{{.*}}, [10 x [10 x i32]]* %{{.*}}) -// CHECK: define internal void [[OUTLINED]](i32* noalias %{{.*}}, i32* noalias %{{.*}} i[[SZ]] %{{.+}}, i[[SZ]] %{{.+}}, i[[SZ]] %{{.+}}, [10 x [10 x i32]]* nonnull align {{[0-9]+}} dereferenceable{{.*}}) -// CHECK-DIV64: div i64 -// CHECK-DIV32-NO: div i64 - -// CHECK: define weak void @__omp_offloading_{{.*}}_l81(i[[SZ:64|32]] %{{[^,]+}}, [1000 x i32]* nonnull align {{[0-9]+}} dereferenceable{{.*}}, i32* %{{[^)]+}}) -// CHECK: call void [[OUTLINED:@__omp_outlined.*]](i32* %{{.+}}, i32* %{{.+}}, i[[SZ]] %{{.*}}, i[[SZ]] %{{.*}}, i[[SZ]] %{{.*}}, [1000 x i32]* %{{.*}}, i32* %{{.*}}) -// CHECK: define internal void [[OUTLINED]](i32* noalias %{{.*}}, i32* noalias %{{.*}} i[[SZ]] %{{.+}}, i[[SZ]] %{{.+}}, i[[SZ]] %{{.+}}, [1000 x i32]* nonnull align {{[0-9]+}} dereferenceable{{.*}}, i32* %{{.*}}) - #endif +// CHECK-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l43 +// CHECK-SAME: (i64 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i64 [[L:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 8 +// CHECK-NEXT: [[L_ADDR:%.*]] = alloca i64, align 8 +// CHECK-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8 +// CHECK-NEXT: [[L_CASTED:%.*]] = alloca i64, align 8 +// CHECK-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK-NEXT: 
store i64 [[N]], i64* [[N_ADDR]], align 8 +// CHECK-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 8 +// CHECK-NEXT: store i64 [[L]], i64* [[L_ADDR]], align 8 +// CHECK-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* +// CHECK-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 8 +// CHECK-NEXT: [[CONV1:%.*]] = bitcast i64* [[L_ADDR]] to i32* +// CHECK-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) +// CHECK-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK: .execute: +// CHECK-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3:[0-9]+]]) +// CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK-NEXT: [[CONV2:%.*]] = bitcast i64* [[N_CASTED]] to i32* +// CHECK-NEXT: store i32 [[TMP2]], i32* [[CONV2]], align 4 +// CHECK-NEXT: [[TMP3:%.*]] = load i64, i64* [[N_CASTED]], align 8 +// CHECK-NEXT: [[TMP4:%.*]] = load i32, i32* [[CONV1]], align 8 +// CHECK-NEXT: [[CONV3:%.*]] = bitcast i64* [[L_CASTED]] to i32* +// CHECK-NEXT: store i32 [[TMP4]], i32* [[CONV3]], align 4 +// CHECK-NEXT: [[TMP5:%.*]] = load i64, i64* [[L_CASTED]], align 8 +// CHECK-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i64 [[TMP3]], [1000 x i32]* [[TMP0]], i64 [[TMP5]]) #[[ATTR2:[0-9]+]] +// CHECK-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK: .omp.deinit: +// CHECK-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) +// CHECK-NEXT: br label [[DOTEXIT:%.*]] +// CHECK: .exit: +// CHECK-NEXT: ret void +// +// +// CHECK-LABEL: define {{[^@]+}}@__omp_outlined__ +// CHECK-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i64 [[L:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 8 +// CHECK-NEXT: [[L_ADDR:%.*]] = alloca i64, align 8 +// CHECK-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[DOTCAPTURE_EXPR_3:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[I5:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8 +// CHECK-NEXT: [[L_CASTED:%.*]] = alloca i64, align 8 +// CHECK-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [5 x i8*], align 8 +// CHECK-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +// CHECK-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 +// CHECK-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 8 +// CHECK-NEXT: store i64 [[L]], i64* [[L_ADDR]], align 8 +// CHECK-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* +// CHECK-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 8 +// 
CHECK-NEXT: [[CONV1:%.*]] = bitcast i64* [[L_ADDR]] to i32*
+// CHECK-NEXT: [[TMP1:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared", align 2
+// CHECK-NEXT: [[TMP2:%.*]] = load i64, i64* @"_openmp_static_kernel$size", align 8
+// CHECK-NEXT: call void @__kmpc_get_team_static_memory(i16 1, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds (%"union._shared_openmp_static_memory_type_$_", %"union._shared_openmp_static_memory_type_$_" addrspace(3)* @"_openmp_shared_static_glob_rd_$_", i32 0, i32 0, i32 0) to i8*), i64 [[TMP2]], i16 [[TMP1]], i8** addrspacecast (i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr" to i8**))
+// CHECK-NEXT: [[TMP3:%.*]] = load i8*, i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr", align 8
+// CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, i8* [[TMP3]], i64 0
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to %struct._globalized_locals_ty*
+// CHECK-NEXT: [[L2:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP5]], i32 0, i32 0
+// CHECK-NEXT: [[TMP6:%.*]] = load i32, i32* [[CONV]], align 8
+// CHECK-NEXT: store i32 [[TMP6]], i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP7]], 0
+// CHECK-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK-NEXT: [[SUB4:%.*]] = sub nsw i32 [[DIV]], 1
+// CHECK-NEXT: store i32 [[SUB4]], i32* [[DOTCAPTURE_EXPR_3]], align 4
+// CHECK-NEXT: store i32 0, i32* [[I]], align 4
+// CHECK-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP8]]
+// CHECK-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]]
+// CHECK: omp.precond.then:
+// CHECK-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4
+// CHECK-NEXT: store i32 [[TMP9]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-NEXT: [[TMP10:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-NEXT: [[TMP11:%.*]] = load i32, i32* [[TMP10]], align 4
+// CHECK-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1:[0-9]+]], i32 [[TMP11]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 128)
+// CHECK-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4
+// CHECK-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[TMP12]], [[TMP13]]
+// CHECK-NEXT: br i1 [[CMP6]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK: cond.true:
+// CHECK-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4
+// CHECK-NEXT: br label [[COND_END:%.*]]
+// CHECK: cond.false:
+// CHECK-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-NEXT: br label [[COND_END]]
+// CHECK: cond.end:
+// CHECK-NEXT: [[COND:%.*]] = phi i32 [ [[TMP14]], [[COND_TRUE]] ], [ [[TMP15]], [[COND_FALSE]] ]
+// CHECK-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-NEXT: store i32 [[TMP16]], i32* [[DOTOMP_IV]], align 4
+// CHECK-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK: omp.inner.for.cond:
+// CHECK-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4
+// CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP18]], 1
+// CHECK-NEXT: [[CMP7:%.*]] = icmp slt i32 [[TMP17]], [[ADD]]
+// CHECK-NEXT: br i1 [[CMP7]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK: omp.inner.for.body:
+// CHECK-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-NEXT: [[TMP20:%.*]] = zext i32 [[TMP19]] to i64
+// CHECK-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-NEXT: [[TMP22:%.*]] = zext i32 [[TMP21]] to i64
+// CHECK-NEXT: [[TMP23:%.*]] = load i32, i32* [[CONV]], align 8
+// CHECK-NEXT: [[CONV8:%.*]] = bitcast i64* [[N_CASTED]] to i32*
+// CHECK-NEXT: store i32 [[TMP23]], i32* [[CONV8]], align 4
+// CHECK-NEXT: [[TMP24:%.*]] = load i64, i64* [[N_CASTED]], align 8
+// CHECK-NEXT: [[TMP25:%.*]] = load i32, i32* [[CONV1]], align 8
+// CHECK-NEXT: [[CONV9:%.*]] = bitcast i64* [[L_CASTED]] to i32*
+// CHECK-NEXT: store i32 [[TMP25]], i32* [[CONV9]], align 4
+// CHECK-NEXT: [[TMP26:%.*]] = load i64, i64* [[L_CASTED]], align 8
+// CHECK-NEXT: [[TMP27:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0
+// CHECK-NEXT: [[TMP28:%.*]] = inttoptr i64 [[TMP20]] to i8*
+// CHECK-NEXT: store i8* [[TMP28]], i8** [[TMP27]], align 8
+// CHECK-NEXT: [[TMP29:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1
+// CHECK-NEXT: [[TMP30:%.*]] = inttoptr i64 [[TMP22]] to i8*
+// CHECK-NEXT: store i8* [[TMP30]], i8** [[TMP29]], align 8
+// CHECK-NEXT: [[TMP31:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 2
+// CHECK-NEXT: [[TMP32:%.*]] = inttoptr i64 [[TMP24]] to i8*
+// CHECK-NEXT: store i8* [[TMP32]], i8** [[TMP31]], align 8
+// CHECK-NEXT: [[TMP33:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 3
+// CHECK-NEXT: [[TMP34:%.*]] = bitcast [1000 x i32]* [[TMP0]] to i8*
+// CHECK-NEXT: store i8* [[TMP34]], i8** [[TMP33]], align 8
+// CHECK-NEXT: [[TMP35:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 4
+// CHECK-NEXT: [[TMP36:%.*]] = inttoptr i64 [[TMP26]] to i8*
+// CHECK-NEXT: store i8* [[TMP36]], i8** [[TMP35]], align 8
+// CHECK-NEXT: [[TMP37:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-NEXT: [[TMP38:%.*]] = load i32, i32* [[TMP37]], align 4
+// CHECK-NEXT: [[TMP39:%.*]] = bitcast [5 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP38]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i64, i64, i64, [1000 x i32]*, i64)* @__omp_outlined__1 to i8*), i8* null, i8** [[TMP39]], i64 5)
+// CHECK-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK: omp.inner.for.inc:
+// CHECK-NEXT: [[TMP40:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-NEXT: [[TMP41:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-NEXT: [[ADD10:%.*]] = add nsw i32 [[TMP40]], [[TMP41]]
+// CHECK-NEXT: store i32 [[ADD10]], i32* [[DOTOMP_IV]], align 4
+// CHECK-NEXT: [[TMP42:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-NEXT: [[TMP43:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-NEXT: [[ADD11:%.*]] = add nsw i32 [[TMP42]], [[TMP43]]
+// CHECK-NEXT: store i32 [[ADD11]], i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-NEXT: [[TMP44:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-NEXT: [[TMP45:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-NEXT: [[ADD12:%.*]] = add nsw i32 [[TMP44]], [[TMP45]]
+// CHECK-NEXT: store i32 [[ADD12]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-NEXT: [[TMP46:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-NEXT: [[TMP47:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4
+// CHECK-NEXT: [[CMP13:%.*]] = icmp sgt i32 [[TMP46]], [[TMP47]]
+// CHECK-NEXT: br i1 [[CMP13]], label [[COND_TRUE14:%.*]], label [[COND_FALSE15:%.*]]
+// CHECK: cond.true14:
+// CHECK-NEXT: [[TMP48:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4
+// CHECK-NEXT: br label [[COND_END16:%.*]]
+// CHECK: cond.false15:
+// CHECK-NEXT: [[TMP49:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-NEXT: br label [[COND_END16]]
+// CHECK: cond.end16:
+// CHECK-NEXT: [[COND17:%.*]] = phi i32 [ [[TMP48]], [[COND_TRUE14]] ], [ [[TMP49]], [[COND_FALSE15]] ]
+// CHECK-NEXT: store i32 [[COND17]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-NEXT: [[TMP50:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-NEXT: store i32 [[TMP50]], i32* [[DOTOMP_IV]], align 4
+// CHECK-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK: omp.inner.for.end:
+// CHECK-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK: omp.loop.exit:
+// CHECK-NEXT: [[TMP51:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-NEXT: [[TMP52:%.*]] = load i32, i32* [[TMP51]], align 4
+// CHECK-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP52]])
+// CHECK-NEXT: [[TMP53:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-NEXT: [[TMP54:%.*]] = icmp ne i32 [[TMP53]], 0
+// CHECK-NEXT: br i1 [[TMP54]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]]
+// CHECK: .omp.lastprivate.then:
+// CHECK-NEXT: [[TMP55:%.*]] = load i32, i32* [[CONV1]], align 8
+// CHECK-NEXT: store i32 [[TMP55]], i32* [[CONV1]], align 8
+// CHECK-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]]
+// CHECK: .omp.lastprivate.done:
+// CHECK-NEXT: br label [[OMP_PRECOND_END]]
+// CHECK: omp.precond.end:
+// CHECK-NEXT: [[TMP56:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared", align 2
+// CHECK-NEXT: call void @__kmpc_restore_team_static_memory(i16 1, i16 [[TMP56]])
+// CHECK-NEXT: ret void
+//
+//
+// CHECK-LABEL: define {{[^@]+}}@__omp_outlined__1
+// CHECK-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], i64 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i64 [[L:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8
+// CHECK-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8
+// CHECK-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8
+// CHECK-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 8
+// CHECK-NEXT: [[L_ADDR:%.*]] = alloca i64, align 8
+// CHECK-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[I6:%.*]] = alloca i32, align 4
+// CHECK-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
+// CHECK-NEXT: store i64 [[DOTPREVIOUS_LB_]], i64* [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK-NEXT: store i64 [[DOTPREVIOUS_UB_]], i64* [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8
+// CHECK-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 8
+// CHECK-NEXT: store i64 [[L]], i64* [[L_ADDR]], align 8
+// CHECK-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32*
+// CHECK-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 8
+// CHECK-NEXT: [[CONV1:%.*]] = bitcast i64* [[L_ADDR]] to i32*
+// CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[CONV]], align 8
+// CHECK-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0
+// CHECK-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK-NEXT: [[SUB3:%.*]] = sub nsw i32 [[DIV]], 1
+// CHECK-NEXT: store i32 [[SUB3]], i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK-NEXT: store i32 0, i32* [[I]], align 4
+// CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]]
+// CHECK-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]]
+// CHECK: omp.precond.then:
+// CHECK-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_UB]], align 4
+// CHECK-NEXT: [[TMP5:%.*]] = load i64, i64* [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK-NEXT: [[CONV4:%.*]] = trunc i64 [[TMP5]] to i32
+// CHECK-NEXT: [[TMP6:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK-NEXT: [[CONV5:%.*]] = trunc i64 [[TMP6]] to i32
+// CHECK-NEXT: store i32 [[CONV4]], i32* [[DOTOMP_LB]], align 4
+// CHECK-NEXT: store i32 [[CONV5]], i32* [[DOTOMP_UB]], align 4
+// CHECK-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4
+// CHECK-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP8]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 32)
+// CHECK-NEXT: br label [[OMP_DISPATCH_COND:%.*]]
+// CHECK: omp.dispatch.cond:
+// CHECK-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK-NEXT: [[CONV7:%.*]] = sext i32 [[TMP9]] to i64
+// CHECK-NEXT: [[TMP10:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK-NEXT: [[CMP8:%.*]] = icmp ugt i64 [[CONV7]], [[TMP10]]
+// CHECK-NEXT: br i1 [[CMP8]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK: cond.true:
+// CHECK-NEXT: [[TMP11:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK-NEXT: br label [[COND_END:%.*]]
+// CHECK: cond.false:
+// CHECK-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK-NEXT: [[CONV9:%.*]] = sext i32 [[TMP12]] to i64
+// CHECK-NEXT: br label [[COND_END]]
+// CHECK: cond.end:
+// CHECK-NEXT: [[COND:%.*]] = phi i64 [ [[TMP11]], [[COND_TRUE]] ], [ [[CONV9]], [[COND_FALSE]] ]
+// CHECK-NEXT: [[CONV10:%.*]] = trunc i64 [[COND]] to i32
+// CHECK-NEXT: store i32 [[CONV10]], i32* [[DOTOMP_UB]], align 4
+// CHECK-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-NEXT: store i32 [[TMP13]], i32* [[DOTOMP_IV]], align 4
+// CHECK-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK-NEXT: [[CMP11:%.*]] = icmp sle i32 [[TMP14]], [[TMP15]]
+// CHECK-NEXT: br i1 [[CMP11]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]]
+// CHECK: omp.dispatch.body:
+// CHECK-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK: omp.inner.for.cond:
+// CHECK-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK-NEXT: [[CMP12:%.*]] = icmp sle i32 [[TMP16]], [[TMP17]]
+// CHECK-NEXT: br i1 [[CMP12]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK: omp.inner.for.body:
+// CHECK-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP18]], 1
+// CHECK-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-NEXT: store i32 [[ADD]], i32* [[I6]], align 4
+// CHECK-NEXT: [[TMP19:%.*]] = load i32, i32* [[I6]], align 4
+// CHECK-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP19]] to i64
+// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1000 x i32], [1000 x i32]* [[TMP0]], i64 0, i64 [[IDXPROM]]
+// CHECK-NEXT: store i32 1, i32* [[ARRAYIDX]], align 4
+// CHECK-NEXT: [[TMP20:%.*]] = load i32, i32* [[I6]], align 4
+// CHECK-NEXT: store i32 [[TMP20]], i32* [[CONV1]], align 8
+// CHECK-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK: omp.body.continue:
+// CHECK-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK: omp.inner.for.inc:
+// CHECK-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-NEXT: [[ADD13:%.*]] = add nsw i32 [[TMP21]], 1
+// CHECK-NEXT: store i32 [[ADD13]], i32* [[DOTOMP_IV]], align 4
+// CHECK-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK: omp.inner.for.end:
+// CHECK-NEXT: br label [[OMP_DISPATCH_INC:%.*]]
+// CHECK: omp.dispatch.inc:
+// CHECK-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-NEXT: [[ADD14:%.*]] = add nsw i32 [[TMP22]], [[TMP23]]
+// CHECK-NEXT: store i32 [[ADD14]], i32* [[DOTOMP_LB]], align 4
+// CHECK-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-NEXT: [[ADD15:%.*]] = add nsw i32 [[TMP24]], [[TMP25]]
+// CHECK-NEXT: store i32 [[ADD15]], i32* [[DOTOMP_UB]], align 4
+// CHECK-NEXT: br label [[OMP_DISPATCH_COND]]
+// CHECK: omp.dispatch.end:
+// CHECK-NEXT: [[TMP26:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-NEXT: [[TMP27:%.*]] = load i32, i32* [[TMP26]], align 4
+// CHECK-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP27]])
+// CHECK-NEXT: [[TMP28:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-NEXT: [[TMP29:%.*]] = icmp ne i32 [[TMP28]], 0
+// CHECK-NEXT: br i1 [[TMP29]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]]
+// CHECK: .omp.lastprivate.then:
+// CHECK-NEXT: [[TMP30:%.*]] = load i32, i32* [[CONV1]], align 8
+// CHECK-NEXT: store i32 [[TMP30]], i32* [[CONV1]], align 8
+// CHECK-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]]
+// CHECK: .omp.lastprivate.done:
+// CHECK-NEXT: br label [[OMP_PRECOND_END]]
+// CHECK: omp.precond.end:
+// CHECK-NEXT: ret void
+//
+//
+// CHECK-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l49
+// CHECK-SAME: (i64 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8
+// CHECK-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 8
+// CHECK-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8
+// CHECK-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8
+// CHECK-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 8
+// CHECK-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32*
+// CHECK-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 8
+// CHECK-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0)
+// CHECK-NEXT: br label [[DOTEXECUTE:%.*]]
+// CHECK: .execute:
+// CHECK-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]])
+// CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[CONV]], align 8
+// CHECK-NEXT: [[CONV1:%.*]] = bitcast i64* [[N_CASTED]] to i32*
+// CHECK-NEXT: store i32 [[TMP2]], i32* [[CONV1]], align 4
+// CHECK-NEXT: [[TMP3:%.*]] = load i64, i64* [[N_CASTED]], align 8
+// CHECK-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK-NEXT: call void @__omp_outlined__2(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i64 [[TMP3]], [1000 x i16]* [[TMP0]]) #[[ATTR2]]
+// CHECK-NEXT: br label [[DOTOMP_DEINIT:%.*]]
+// CHECK: .omp.deinit:
+// CHECK-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0)
+// CHECK-NEXT: br label [[DOTEXIT:%.*]]
+// CHECK: .exit:
+// CHECK-NEXT: ret void
+//
+//
+// CHECK-LABEL: define {{[^@]+}}@__omp_outlined__2
+// CHECK-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8
+// CHECK-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 8
+// CHECK-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[I3:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8
+// CHECK-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x i8*], align 8
+// CHECK-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
+// CHECK-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8
+// CHECK-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 8
+// CHECK-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32*
+// CHECK-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[CONV]], align 8
+// CHECK-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0
+// CHECK-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1
+// CHECK-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK-NEXT: store i32 0, i32* [[I]], align 4
+// CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]]
+// CHECK-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]]
+// CHECK: omp.precond.then:
+// CHECK-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK-NEXT: [[TMP5:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4
+// CHECK-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP6]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]])
+// CHECK-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP7]], [[TMP8]]
+// CHECK-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK: cond.true:
+// CHECK-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK-NEXT: br label [[COND_END:%.*]]
+// CHECK: cond.false:
+// CHECK-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-NEXT: br label [[COND_END]]
+// CHECK: cond.end:
+// CHECK-NEXT: [[COND:%.*]] = phi i32 [ [[TMP9]], [[COND_TRUE]] ], [ [[TMP10]], [[COND_FALSE]] ]
+// CHECK-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-NEXT: store i32 [[TMP11]], i32* [[DOTOMP_IV]], align 4
+// CHECK-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK: omp.inner.for.cond:
+// CHECK-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP13]], 1
+// CHECK-NEXT: [[CMP5:%.*]] = icmp slt i32 [[TMP12]], [[ADD]]
+// CHECK-NEXT: br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK: omp.inner.for.body:
+// CHECK-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-NEXT: [[TMP15:%.*]] = zext i32 [[TMP14]] to i64
+// CHECK-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-NEXT: [[TMP17:%.*]] = zext i32 [[TMP16]] to i64
+// CHECK-NEXT: [[TMP18:%.*]] = load i32, i32* [[CONV]], align 8
+// CHECK-NEXT: [[CONV6:%.*]] = bitcast i64* [[N_CASTED]] to i32*
+// CHECK-NEXT: store i32 [[TMP18]], i32* [[CONV6]], align 4
+// CHECK-NEXT: [[TMP19:%.*]] = load i64, i64* [[N_CASTED]], align 8
+// CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0
+// CHECK-NEXT: [[TMP21:%.*]] = inttoptr i64 [[TMP15]] to i8*
+// CHECK-NEXT: store i8* [[TMP21]], i8** [[TMP20]], align 8
+// CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1
+// CHECK-NEXT: [[TMP23:%.*]] = inttoptr i64 [[TMP17]] to i8*
+// CHECK-NEXT: store i8* [[TMP23]], i8** [[TMP22]], align 8
+// CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 2
+// CHECK-NEXT: [[TMP25:%.*]] = inttoptr i64 [[TMP19]] to i8*
+// CHECK-NEXT: store i8* [[TMP25]], i8** [[TMP24]], align 8
+// CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 3
+// CHECK-NEXT: [[TMP27:%.*]] = bitcast [1000 x i16]* [[TMP0]] to i8*
+// CHECK-NEXT: store i8* [[TMP27]], i8** [[TMP26]], align 8
+// CHECK-NEXT: [[TMP28:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-NEXT: [[TMP29:%.*]] = load i32, i32* [[TMP28]], align 4
+// CHECK-NEXT: [[TMP30:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP29]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i64, i64, i64, [1000 x i16]*)* @__omp_outlined__3 to i8*), i8* null, i8** [[TMP30]], i64 4)
+// CHECK-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK: omp.inner.for.inc:
+// CHECK-NEXT: [[TMP31:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-NEXT: [[TMP32:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP31]], [[TMP32]]
+// CHECK-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_IV]], align 4
+// CHECK-NEXT: [[TMP33:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-NEXT: [[TMP34:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP33]], [[TMP34]]
+// CHECK-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-NEXT: [[TMP35:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-NEXT: [[TMP36:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP35]], [[TMP36]]
+// CHECK-NEXT: store i32 [[ADD9]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-NEXT: [[TMP37:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-NEXT: [[TMP38:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK-NEXT: [[CMP10:%.*]] = icmp sgt i32 [[TMP37]], [[TMP38]]
+// CHECK-NEXT: br i1 [[CMP10]], label [[COND_TRUE11:%.*]], label [[COND_FALSE12:%.*]]
+// CHECK: cond.true11:
+// CHECK-NEXT: [[TMP39:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK-NEXT: br label [[COND_END13:%.*]]
+// CHECK: cond.false12:
+// CHECK-NEXT: [[TMP40:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-NEXT: br label [[COND_END13]]
+// CHECK: cond.end13:
+// CHECK-NEXT: [[COND14:%.*]] = phi i32 [ [[TMP39]], [[COND_TRUE11]] ], [ [[TMP40]], [[COND_FALSE12]] ]
+// CHECK-NEXT: store i32 [[COND14]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-NEXT: [[TMP41:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-NEXT: store i32 [[TMP41]], i32* [[DOTOMP_IV]], align 4
+// CHECK-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK: omp.inner.for.end:
+// CHECK-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK: omp.loop.exit:
+// CHECK-NEXT: [[TMP42:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-NEXT: [[TMP43:%.*]] = load i32, i32* [[TMP42]], align 4
+// CHECK-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP43]])
+// CHECK-NEXT: br label [[OMP_PRECOND_END]]
+// CHECK: omp.precond.end:
+// CHECK-NEXT: ret void
+//
+//
+// CHECK-LABEL: define {{[^@]+}}@__omp_outlined__3
+// CHECK-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], i64 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8
+// CHECK-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8
+// CHECK-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8
+// CHECK-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 8
+// CHECK-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[I5:%.*]] = alloca i32, align 4
+// CHECK-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
+// CHECK-NEXT: store i64 [[DOTPREVIOUS_LB_]], i64* [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK-NEXT: store i64 [[DOTPREVIOUS_UB_]], i64* [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8
+// CHECK-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 8
+// CHECK-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32*
+// CHECK-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[CONV]], align 8
+// CHECK-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0
+// CHECK-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1
+// CHECK-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK-NEXT: store i32 0, i32* [[I]], align 4
+// CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]]
+// CHECK-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]]
+// CHECK: omp.precond.then:
+// CHECK-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_UB]], align 4
+// CHECK-NEXT: [[TMP5:%.*]] = load i64, i64* [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK-NEXT: [[CONV3:%.*]] = trunc i64 [[TMP5]] to i32
+// CHECK-NEXT: [[TMP6:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK-NEXT: [[CONV4:%.*]] = trunc i64 [[TMP6]] to i32
+// CHECK-NEXT: store i32 [[CONV3]], i32* [[DOTOMP_LB]], align 4
+// CHECK-NEXT: store i32 [[CONV4]], i32* [[DOTOMP_UB]], align 4
+// CHECK-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4
+// CHECK-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP8]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-NEXT: store i32 [[TMP9]], i32* [[DOTOMP_IV]], align 4
+// CHECK-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK: omp.inner.for.cond:
+// CHECK-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-NEXT: [[CONV6:%.*]] = sext i32 [[TMP10]] to i64
+// CHECK-NEXT: [[TMP11:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK-NEXT: [[CMP7:%.*]] = icmp ule i64 [[CONV6]], [[TMP11]]
+// CHECK-NEXT: br i1 [[CMP7]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK: omp.inner.for.body:
+// CHECK-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP12]], 1
+// CHECK-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-NEXT: store i32 [[ADD]], i32* [[I5]], align 4
+// CHECK-NEXT: [[TMP13:%.*]] = load i32, i32* [[I5]], align 4
+// CHECK-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP13]] to i64
+// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1000 x i16], [1000 x i16]* [[TMP0]], i64 0, i64 [[IDXPROM]]
+// CHECK-NEXT: [[TMP14:%.*]] = load i16, i16* [[ARRAYIDX]], align 2
+// CHECK-NEXT: [[CONV8:%.*]] = sext i16 [[TMP14]] to i32
+// CHECK-NEXT: [[ADD9:%.*]] = add nsw i32 [[CONV8]], 1
+// CHECK-NEXT: [[CONV10:%.*]] = trunc i32 [[ADD9]] to i16
+// CHECK-NEXT: store i16 [[CONV10]], i16* [[ARRAYIDX]], align 2
+// CHECK-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK: omp.body.continue:
+// CHECK-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK: omp.inner.for.inc:
+// CHECK-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-NEXT: [[ADD11:%.*]] = add nsw i32 [[TMP15]], [[TMP16]]
+// CHECK-NEXT: store i32 [[ADD11]], i32* [[DOTOMP_IV]], align 4
+// CHECK-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK: omp.inner.for.end:
+// CHECK-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK: omp.loop.exit:
+// CHECK-NEXT: [[TMP17:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-NEXT: [[TMP18:%.*]] = load i32, i32* [[TMP17]], align 4
+// CHECK-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP18]])
+// CHECK-NEXT: br label [[OMP_PRECOND_END]]
+// CHECK: omp.precond.end:
+// CHECK-NEXT: ret void
+//
+//
+// CHECK-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l54
+// CHECK-SAME: ([10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 8
+// CHECK-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 8
+// CHECK-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 8
+// CHECK-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0)
+// CHECK-NEXT: br label [[DOTEXECUTE:%.*]]
+// CHECK: .execute:
+// CHECK-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]])
+// CHECK-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK-NEXT: call void @__omp_outlined__4(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x i32]* [[TMP0]]) #[[ATTR2]]
+// CHECK-NEXT: br label [[DOTOMP_DEINIT:%.*]]
+// CHECK: .omp.deinit:
+// CHECK-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0)
+// CHECK-NEXT: br label [[DOTEXIT:%.*]]
+// CHECK: .exit:
+// CHECK-NEXT: ret void
+//
+//
+// CHECK-LABEL: define {{[^@]+}}@__omp_outlined__4
+// CHECK-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 8
+// CHECK-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [3 x i8*], align 8
+// CHECK-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
+// CHECK-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 8
+// CHECK-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 8
+// CHECK-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-NEXT: store i32 9, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK-NEXT: [[TMP1:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4
+// CHECK-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]])
+// CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP3]], 9
+// CHECK-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK: cond.true:
+// CHECK-NEXT: br label [[COND_END:%.*]]
+// CHECK: cond.false:
+// CHECK-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-NEXT: br label [[COND_END]]
+// CHECK: cond.end:
+// CHECK-NEXT: [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP4]], [[COND_FALSE]] ]
+// CHECK-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4
+// CHECK-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK: omp.inner.for.cond:
+// CHECK-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP6]], 10
+// CHECK-NEXT: br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK: omp.inner.for.body:
+// CHECK-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-NEXT: [[TMP8:%.*]] = zext i32 [[TMP7]] to i64
+// CHECK-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-NEXT: [[TMP10:%.*]] = zext i32 [[TMP9]] to i64
+// CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0
+// CHECK-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP8]] to i8*
+// CHECK-NEXT: store i8* [[TMP12]], i8** [[TMP11]], align 8
+// CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1
+// CHECK-NEXT: [[TMP14:%.*]] = inttoptr i64 [[TMP10]] to i8*
+// CHECK-NEXT: store i8* [[TMP14]], i8** [[TMP13]], align 8
+// CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 2
+// CHECK-NEXT: [[TMP16:%.*]] = bitcast [10 x i32]* [[TMP0]] to i8*
+// CHECK-NEXT: store i8* [[TMP16]], i8** [[TMP15]], align 8
+// CHECK-NEXT: [[TMP17:%.*]] = bitcast [3 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i64, i64, [10 x i32]*)* @__omp_outlined__5 to i8*), i8* null, i8** [[TMP17]], i64 3)
+// CHECK-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK: omp.inner.for.inc:
+// CHECK-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP18]], [[TMP19]]
+// CHECK-NEXT: store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4
+// CHECK-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP20]], [[TMP21]]
+// CHECK-NEXT: store i32 [[ADD2]], i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP22]], [[TMP23]]
+// CHECK-NEXT: store i32 [[ADD3]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP24]], 9
+// CHECK-NEXT: br i1 [[CMP4]], label [[COND_TRUE5:%.*]], label [[COND_FALSE6:%.*]]
+// CHECK: cond.true5:
+// CHECK-NEXT: br label [[COND_END7:%.*]]
+// CHECK: cond.false6:
+// CHECK-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-NEXT: br label [[COND_END7]]
+// CHECK: cond.end7:
+// CHECK-NEXT: [[COND8:%.*]] = phi i32 [ 9, [[COND_TRUE5]] ], [ [[TMP25]], [[COND_FALSE6]] ]
+// CHECK-NEXT: store i32 [[COND8]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-NEXT: [[TMP26:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-NEXT: store i32 [[TMP26]], i32* [[DOTOMP_IV]], align 4
+// CHECK-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK: omp.inner.for.end:
+// CHECK-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK: omp.loop.exit:
+// CHECK-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]])
+// CHECK-NEXT: ret void
+//
+//
+// CHECK-LABEL: define {{[^@]+}}@__omp_outlined__5
+// CHECK-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8
+// CHECK-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8
+// CHECK-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 8
+// CHECK-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
+// CHECK-NEXT: store i64 [[DOTPREVIOUS_LB_]], i64* [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK-NEXT: store i64 [[DOTPREVIOUS_UB_]], i64* [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 8
+// CHECK-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 8
+// CHECK-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK-NEXT: store i32 9, i32* [[DOTOMP_UB]], align 4
+// CHECK-NEXT: [[TMP1:%.*]] = load i64, i64* [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK-NEXT: [[CONV:%.*]] = trunc i64 [[TMP1]] to i32
+// CHECK-NEXT: [[TMP2:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK-NEXT: [[CONV1:%.*]] = trunc i64 [[TMP2]] to i32
+// CHECK-NEXT: store i32 [[CONV]], i32* [[DOTOMP_LB]], align 4
+// CHECK-NEXT: store i32 [[CONV1]], i32* [[DOTOMP_UB]], align 4
+// CHECK-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-NEXT: [[TMP3:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4
+// CHECK-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP4]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4
+// CHECK-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK: omp.inner.for.cond:
+// CHECK-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-NEXT: [[CONV2:%.*]] = sext i32 [[TMP6]] to i64
+// CHECK-NEXT: [[TMP7:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK-NEXT: [[CMP:%.*]] = icmp ule i64 [[CONV2]], [[TMP7]]
+// CHECK-NEXT: br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK: omp.inner.for.body:
+// CHECK-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP8]], 1
+// CHECK-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-NEXT: store i32 [[ADD]], i32* [[I]], align 4
+// CHECK-NEXT: [[TMP9:%.*]] = load i32, i32* [[I]], align 4
+// CHECK-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP9]] to i64
+// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP0]], i64 0, i64 [[IDXPROM]]
+// CHECK-NEXT: [[TMP10:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+// CHECK-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP10]], 1
+// CHECK-NEXT: store i32 [[ADD3]], i32* [[ARRAYIDX]], align 4
+// CHECK-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK: omp.body.continue:
+// CHECK-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK: omp.inner.for.inc:
+// CHECK-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-NEXT: [[ADD4:%.*]] = add nsw i32 [[TMP11]], [[TMP12]]
+// CHECK-NEXT: store i32 [[ADD4]], i32* [[DOTOMP_IV]], align 4
+// CHECK-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK: omp.inner.for.end:
+// CHECK-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK: omp.loop.exit:
+// CHECK-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP4]])
+// CHECK-NEXT: ret void
+//
+//
+// CHECK-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l59
+// CHECK-SAME: ([10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i64 [[F:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 8
+// CHECK-NEXT: [[F_ADDR:%.*]] = alloca i64, align 8
+// CHECK-NEXT: [[F_CASTED:%.*]] = alloca i64, align 8
+// CHECK-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 8
+// CHECK-NEXT: store i64 [[F]], i64* [[F_ADDR]], align 8
+// CHECK-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 8
+// CHECK-NEXT: [[CONV:%.*]] = bitcast i64* [[F_ADDR]] to i32*
+// CHECK-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0)
+// CHECK-NEXT: br label [[DOTEXECUTE:%.*]]
+// CHECK: .execute:
+// CHECK-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]])
+// CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[CONV]], align 8
+// CHECK-NEXT: [[CONV1:%.*]] = bitcast i64* [[F_CASTED]] to i32*
+// CHECK-NEXT: store i32 [[TMP2]], i32* [[CONV1]], align 4
+// CHECK-NEXT: [[TMP3:%.*]] = load i64, i64* [[F_CASTED]], align 8
+// CHECK-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK-NEXT: call void @__omp_outlined__6(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x [10 x i32]]* [[TMP0]], i64 [[TMP3]]) #[[ATTR2]]
+// CHECK-NEXT: br label [[DOTOMP_DEINIT:%.*]]
+// CHECK: .omp.deinit:
+// CHECK-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0)
+// CHECK-NEXT: br label [[DOTEXIT:%.*]]
+// CHECK: .exit:
+// CHECK-NEXT: ret void
+//
+//
+// CHECK-LABEL: define {{[^@]+}}@__omp_outlined__6
+// CHECK-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i64 [[F:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 8
+// CHECK-NEXT: [[F_ADDR:%.*]] = alloca i64, align 8
+// CHECK-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[_TMP1:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[K:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[J:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[F_CASTED:%.*]] = alloca i64, align 8
+// CHECK-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x i8*], align 8
+// CHECK-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
+// CHECK-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 8
+// CHECK-NEXT: store i64 [[F]], i64* [[F_ADDR]], align 8
+// CHECK-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 8
+// CHECK-NEXT: [[CONV:%.*]] = bitcast i64* [[F_ADDR]] to i32*
+// CHECK-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-NEXT: store i32 99, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK-NEXT: [[TMP1:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4
+// CHECK-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]])
+// CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP3]], 99
+// CHECK-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK: cond.true:
+// CHECK-NEXT: br label [[COND_END:%.*]]
+// CHECK: cond.false:
+// CHECK-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-NEXT: br label [[COND_END]]
+// CHECK: cond.end:
+// CHECK-NEXT: [[COND:%.*]] = phi i32 [ 99, [[COND_TRUE]] ], [ [[TMP4]], [[COND_FALSE]] ]
+// CHECK-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4
+// CHECK-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK: omp.inner.for.cond:
+// CHECK-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-NEXT: [[CMP2:%.*]] = icmp slt i32 [[TMP6]], 100
+// CHECK-NEXT: br i1 [[CMP2]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK: omp.inner.for.body:
+// CHECK-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-NEXT: [[TMP8:%.*]] = zext i32 [[TMP7]] to i64
+// CHECK-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-NEXT: [[TMP10:%.*]] = zext i32 [[TMP9]] to i64
+// CHECK-NEXT: [[TMP11:%.*]] = load i32, i32* [[CONV]], align 8
+// CHECK-NEXT: [[CONV3:%.*]] = bitcast i64* [[F_CASTED]] to i32*
+// CHECK-NEXT: store i32 [[TMP11]], i32* [[CONV3]], align 4
+// CHECK-NEXT: [[TMP12:%.*]] = load i64, i64* [[F_CASTED]], align 8
+// CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0
+// CHECK-NEXT: [[TMP14:%.*]] = inttoptr i64 [[TMP8]] to i8*
+// CHECK-NEXT: store i8* [[TMP14]], i8** [[TMP13]], align 8
+// CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1
+// CHECK-NEXT: [[TMP16:%.*]] = inttoptr i64 [[TMP10]] to i8*
+// CHECK-NEXT: store i8* [[TMP16]], i8** [[TMP15]], align 8
+// CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 2
+// CHECK-NEXT: [[TMP18:%.*]] = bitcast [10 x [10 x i32]]* [[TMP0]] to i8*
+// CHECK-NEXT: store i8* [[TMP18]], i8** [[TMP17]], align 8
+// CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 3
+// CHECK-NEXT: [[TMP20:%.*]] = inttoptr i64 [[TMP12]] to i8*
+// CHECK-NEXT: store i8* [[TMP20]], i8** [[TMP19]], align 8
+// CHECK-NEXT: [[TMP21:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i64, i64, [10 x [10 x i32]]*, i64)* @__omp_outlined__7 to i8*), i8* null, i8** [[TMP21]], i64 4)
+// CHECK-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK: omp.inner.for.inc:
+// CHECK-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP22]], [[TMP23]]
+// CHECK-NEXT: store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4
+// CHECK-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-NEXT: [[ADD4:%.*]] = add nsw i32 [[TMP24]], [[TMP25]]
+// CHECK-NEXT: store i32 [[ADD4]], i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-NEXT: [[TMP26:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-NEXT: [[TMP27:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-NEXT: [[ADD5:%.*]] = add nsw i32 [[TMP26]], [[TMP27]]
+// CHECK-NEXT: store i32 [[ADD5]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-NEXT: [[TMP28:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[TMP28]], 99
+// CHECK-NEXT: br i1 [[CMP6]], label [[COND_TRUE7:%.*]], label [[COND_FALSE8:%.*]]
+// CHECK: cond.true7:
+// CHECK-NEXT: br label [[COND_END9:%.*]]
+// CHECK: cond.false8:
+// CHECK-NEXT: [[TMP29:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-NEXT: br label [[COND_END9]]
+// CHECK: cond.end9:
+// CHECK-NEXT: [[COND10:%.*]] = phi i32 [ 99, [[COND_TRUE7]] ], [ [[TMP29]], [[COND_FALSE8]] ]
+// CHECK-NEXT: store i32 [[COND10]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-NEXT: [[TMP30:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-NEXT: store i32 [[TMP30]], i32* [[DOTOMP_IV]], align 4
+// CHECK-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK: omp.inner.for.end:
+// CHECK-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK: omp.loop.exit:
+// CHECK-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]])
+// CHECK-NEXT: ret void
+//
+//
+// CHECK-LABEL: define {{[^@]+}}@__omp_outlined__7
+// CHECK-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i64 [[F:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8
+// CHECK-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8
+// CHECK-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 8
+// CHECK-NEXT: [[F_ADDR:%.*]] = alloca i64, align 8
+// CHECK-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[_TMP1:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[K:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[J:%.*]] = alloca i32, align 4
+// CHECK-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
+// CHECK-NEXT: store i64 [[DOTPREVIOUS_LB_]], i64* [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK-NEXT: store i64 [[DOTPREVIOUS_UB_]], i64* [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 8
+// CHECK-NEXT: store i64 [[F]], i64* [[F_ADDR]], align 8
+// CHECK-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 8
+// CHECK-NEXT: [[CONV:%.*]] = bitcast i64* [[F_ADDR]] to i32*
+// CHECK-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK-NEXT: store i32 99, i32* [[DOTOMP_UB]], align 4
+// CHECK-NEXT: [[TMP1:%.*]] = load i64, i64* [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK-NEXT: [[CONV2:%.*]] = trunc i64 [[TMP1]] to i32
+// CHECK-NEXT: [[TMP2:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK-NEXT: [[CONV3:%.*]] = trunc i64 [[TMP2]] to i32
+// CHECK-NEXT: store i32 [[CONV2]], i32* [[DOTOMP_LB]], align 4
+// CHECK-NEXT: store i32 [[CONV3]], i32* [[DOTOMP_UB]], align 4
+// CHECK-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-NEXT: [[TMP3:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4
+// CHECK-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP4]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4
+// CHECK-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK: omp.inner.for.cond:
+// CHECK-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-NEXT: [[CONV4:%.*]] = sext i32 [[TMP6]] to i64
+// CHECK-NEXT: [[TMP7:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK-NEXT: [[CMP:%.*]] = icmp ule i64 [[CONV4]], [[TMP7]]
+// CHECK-NEXT: br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK: omp.inner.for.body:
+// CHECK-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-NEXT: [[DIV:%.*]] = sdiv i32 [[TMP8]], 10
+// CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[DIV]], 1
+// CHECK-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-NEXT: store i32 [[ADD]], i32* [[I]], align 4
+// CHECK-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-NEXT: [[DIV5:%.*]] = sdiv i32 [[TMP10]], 10
+// CHECK-NEXT: [[MUL6:%.*]] = mul nsw i32 [[DIV5]], 10
+// CHECK-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP9]], [[MUL6]]
+// CHECK-NEXT: [[MUL7:%.*]] = mul nsw i32 [[SUB]], 1
+// CHECK-NEXT: [[ADD8:%.*]] = add nsw i32 0, [[MUL7]]
+// CHECK-NEXT: store i32 [[ADD8]], i32* [[J]], align 4
+// CHECK-NEXT: store i32 10, i32* [[K]], align 4
+// CHECK-NEXT: [[TMP11:%.*]] = load i32, i32* [[I]], align 4
+// CHECK-NEXT: [[TMP12:%.*]] = load i32, i32* [[J]], align 4
+// CHECK-NEXT: [[TMP13:%.*]] = load i32, i32* [[CONV]], align 8
+// CHECK-NEXT: [[MUL9:%.*]] = mul nsw i32 [[TMP12]], [[TMP13]]
+// CHECK-NEXT: [[ADD10:%.*]] = add nsw i32 [[TMP11]], [[MUL9]]
+// CHECK-NEXT: [[TMP14:%.*]] = load i32, i32* [[K]], align 4
+// CHECK-NEXT: [[ADD11:%.*]] = add nsw i32 [[ADD10]], [[TMP14]]
+// CHECK-NEXT: [[TMP15:%.*]] = load i32, i32* [[I]], align 4
+// CHECK-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP15]] to i64
+// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* [[TMP0]], i64 0, i64 [[IDXPROM]]
+// CHECK-NEXT: [[TMP16:%.*]] = load i32, i32* [[J]], align 4
+// CHECK-NEXT: [[IDXPROM12:%.*]] = sext i32 [[TMP16]] to i64
+// CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[ARRAYIDX]], i64 0, i64 [[IDXPROM12]]
+// CHECK-NEXT: store i32 [[ADD11]], i32* [[ARRAYIDX13]], align 4
+// CHECK-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK: omp.body.continue:
+// CHECK-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK: omp.inner.for.inc:
+// CHECK-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-NEXT: [[ADD14:%.*]] = add nsw i32 [[TMP17]], [[TMP18]]
+// CHECK-NEXT: store i32 [[ADD14]], i32* [[DOTOMP_IV]], align 4
+// CHECK-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK: omp.inner.for.end:
+// CHECK-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK: omp.loop.exit:
+// CHECK-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP4]])
+// CHECK-NEXT: ret void
+//
+//
+// CHECK-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l67
+// CHECK-SAME: (i64 [[N:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8
+// CHECK-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 8
+// CHECK-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8
+// CHECK-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8
+// CHECK-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 8
+// CHECK-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32*
+// CHECK-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 8
+// CHECK-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0)
+// CHECK-NEXT: br label [[DOTEXECUTE:%.*]]
+// CHECK: .execute:
+// CHECK-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]])
+// CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[CONV]], align 8
+// CHECK-NEXT: [[CONV1:%.*]] = bitcast i64* [[N_CASTED]] to i32*
+// CHECK-NEXT: store i32 [[TMP2]], i32* [[CONV1]], align 4
+// CHECK-NEXT: [[TMP3:%.*]] = load i64, i64* [[N_CASTED]], align 8
+// CHECK-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK-NEXT: call void @__omp_outlined__8(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i64 [[TMP3]], [10 x [10 x i32]]* [[TMP0]]) #[[ATTR2]]
+// CHECK-NEXT: br label [[DOTOMP_DEINIT:%.*]]
+// CHECK: .omp.deinit:
+// CHECK-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0)
+// CHECK-NEXT: br label [[DOTEXIT:%.*]]
+// CHECK: .exit:
+// CHECK-NEXT: ret void
+//
+//
+// CHECK-LABEL: define {{[^@]+}}@__omp_outlined__8
+// CHECK-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[N:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8
+// CHECK-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 8
+// CHECK-NEXT: [[DOTOMP_IV:%.*]] = alloca i64, align 8
+// CHECK-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[_TMP1:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[DOTCAPTURE_EXPR_3:%.*]] = alloca i64, align 8
+// CHECK-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[J:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i64, align 8
+// CHECK-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i64, align 8
+// CHECK-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i64, align 8
+// CHECK-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[I10:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[J11:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8
+// CHECK-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x i8*], align 8
+// CHECK-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
+// CHECK-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8
+// CHECK-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 8
+// CHECK-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32*
+// CHECK-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[CONV]], align 8
+// CHECK-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[CONV]], align 8
+// CHECK-NEXT: store i32 [[TMP2]], i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP3]], 0
+// CHECK-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK-NEXT: [[CONV4:%.*]] = sext i32 [[DIV]] to i64
+// CHECK-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK-NEXT: [[SUB5:%.*]] = sub nsw i32 [[TMP4]], 0
+// CHECK-NEXT: [[DIV6:%.*]] = sdiv i32 [[SUB5]], 1
+// CHECK-NEXT: [[CONV7:%.*]] = sext i32 [[DIV6]] to i64
+// CHECK-NEXT: [[MUL:%.*]] = mul nsw i64 [[CONV4]], [[CONV7]]
+// CHECK-NEXT: [[SUB8:%.*]] = sub nsw i64 [[MUL]], 1
+// CHECK-NEXT: store i64 [[SUB8]], i64* [[DOTCAPTURE_EXPR_3]], align 8
+// CHECK-NEXT: store i32 0, i32* [[I]], align 4
+// CHECK-NEXT: store i32 0, i32* [[J]], align 4
+// CHECK-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP5]]
+// CHECK-NEXT: br i1 [[CMP]], label [[LAND_LHS_TRUE:%.*]], label [[OMP_PRECOND_END:%.*]]
+// CHECK: land.lhs.true:
+// CHECK-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK-NEXT: [[CMP9:%.*]] = icmp slt i32 0, [[TMP6]]
+// CHECK-NEXT: br i1 [[CMP9]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END]]
+// CHECK: omp.precond.then:
+// CHECK-NEXT: store i64 0, i64* [[DOTOMP_COMB_LB]], align 8
+// CHECK-NEXT: [[TMP7:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8
+// CHECK-NEXT: store i64 [[TMP7]], i64* [[DOTOMP_COMB_UB]], align 8
+// CHECK-NEXT: store i64 1, i64* [[DOTOMP_STRIDE]], align 8
+// CHECK-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK-NEXT: [[CONV12:%.*]] = zext i32 [[NVPTX_NUM_THREADS]] to i64
+// CHECK-NEXT: [[TMP8:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP8]], align 4
+// CHECK-NEXT: call void @__kmpc_for_static_init_8(%struct.ident_t* @[[GLOB1]], i32 [[TMP9]], i32 91, i32* [[DOTOMP_IS_LAST]], i64* [[DOTOMP_COMB_LB]], i64* [[DOTOMP_COMB_UB]], i64* [[DOTOMP_STRIDE]], i64 1, i64 [[CONV12]])
+// CHECK-NEXT: [[TMP10:%.*]] = load i64, i64* [[DOTOMP_COMB_UB]], align 8
+// CHECK-NEXT: [[TMP11:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8
+// CHECK-NEXT: [[CMP13:%.*]] = icmp sgt i64 [[TMP10]], [[TMP11]]
+// CHECK-NEXT: br i1 [[CMP13]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK: cond.true:
+// CHECK-NEXT: [[TMP12:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8
+// CHECK-NEXT: br label [[COND_END:%.*]]
+// CHECK: cond.false:
+// CHECK-NEXT: [[TMP13:%.*]] = load i64, i64* [[DOTOMP_COMB_UB]], align 8
+// CHECK-NEXT: br label [[COND_END]]
+// CHECK: cond.end:
+// CHECK-NEXT: [[COND:%.*]] = phi i64 [ [[TMP12]], [[COND_TRUE]] ], [ [[TMP13]], [[COND_FALSE]] ]
+// CHECK-NEXT: store i64 [[COND]], i64* [[DOTOMP_COMB_UB]], align 8
+// CHECK-NEXT: [[TMP14:%.*]] = load i64, i64* [[DOTOMP_COMB_LB]], align 8
+// CHECK-NEXT: store i64 [[TMP14]], i64* [[DOTOMP_IV]], align 8
+// CHECK-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK: omp.inner.for.cond:
+// CHECK-NEXT: [[TMP15:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8
+// CHECK-NEXT: [[TMP16:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8
+// CHECK-NEXT: [[ADD:%.*]] = add nsw i64 [[TMP16]], 1
+// CHECK-NEXT: [[CMP14:%.*]] = icmp slt i64 [[TMP15]], [[ADD]]
+// CHECK-NEXT: br i1 [[CMP14]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK: omp.inner.for.body:
+// CHECK-NEXT: [[TMP17:%.*]] = load i64, i64* [[DOTOMP_COMB_LB]], align 8
+// CHECK-NEXT: [[TMP18:%.*]] = load i64, i64* [[DOTOMP_COMB_UB]], align 8
+// CHECK-NEXT: [[TMP19:%.*]] = load i32, i32* [[CONV]], align 8
+// CHECK-NEXT: [[CONV15:%.*]] = bitcast i64* [[N_CASTED]] to i32*
+// CHECK-NEXT: store i32 [[TMP19]], i32* [[CONV15]], align 4
+// CHECK-NEXT: [[TMP20:%.*]] = load i64, i64* [[N_CASTED]], align 8
+// CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0
+// CHECK-NEXT: [[TMP22:%.*]] = inttoptr i64 [[TMP17]] to i8*
+// CHECK-NEXT: store i8* [[TMP22]], i8** [[TMP21]], align 8
+// CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1
+// CHECK-NEXT: [[TMP24:%.*]] = inttoptr i64 [[TMP18]] to i8*
+// CHECK-NEXT: store i8* [[TMP24]], i8** [[TMP23]], align 8
+// CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 2
+// CHECK-NEXT: [[TMP26:%.*]] = inttoptr i64 [[TMP20]] to i8*
+// CHECK-NEXT: store i8* [[TMP26]], i8** [[TMP25]], align 8
+// CHECK-NEXT: [[TMP27:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 3
+// CHECK-NEXT: [[TMP28:%.*]] = bitcast [10 x [10 x i32]]* [[TMP0]] to i8*
+// CHECK-NEXT: store i8* [[TMP28]], i8** [[TMP27]], align 8
+// CHECK-NEXT: [[TMP29:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-NEXT: [[TMP30:%.*]] = load i32, i32* [[TMP29]], align 4
+// CHECK-NEXT: [[TMP31:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP30]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i64, i64, i64, [10 x [10 x i32]]*)* @__omp_outlined__9 to i8*), i8* null, i8** [[TMP31]], i64 4)
+// CHECK-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK: omp.inner.for.inc:
+// CHECK-NEXT: [[TMP32:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8
+// CHECK-NEXT: [[TMP33:%.*]] = load i64, i64* [[DOTOMP_STRIDE]], align 8
+// CHECK-NEXT: [[ADD16:%.*]] = add nsw i64 [[TMP32]], [[TMP33]]
+// CHECK-NEXT: store i64 [[ADD16]], i64* [[DOTOMP_IV]], align 8
+// CHECK-NEXT: [[TMP34:%.*]] = load i64, i64* [[DOTOMP_COMB_LB]], align 8
+// CHECK-NEXT: [[TMP35:%.*]] = load i64, i64* [[DOTOMP_STRIDE]], align 8
+// CHECK-NEXT: [[ADD17:%.*]] = add nsw i64 [[TMP34]], [[TMP35]]
+// CHECK-NEXT: store i64 [[ADD17]], i64* [[DOTOMP_COMB_LB]], align 8
+// CHECK-NEXT: [[TMP36:%.*]] = load i64, i64* [[DOTOMP_COMB_UB]], align 8
+// CHECK-NEXT: [[TMP37:%.*]] = load i64, i64* [[DOTOMP_STRIDE]], align 8
+// CHECK-NEXT: [[ADD18:%.*]] = add nsw i64 [[TMP36]], [[TMP37]]
+// CHECK-NEXT: store i64 [[ADD18]], i64* [[DOTOMP_COMB_UB]], align 8
+// CHECK-NEXT: [[TMP38:%.*]] = load i64, i64* [[DOTOMP_COMB_UB]], align 8
+// CHECK-NEXT: [[TMP39:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8
+// CHECK-NEXT: [[CMP19:%.*]] = icmp sgt i64 [[TMP38]], [[TMP39]]
+// CHECK-NEXT: br i1 [[CMP19]], label [[COND_TRUE20:%.*]], label [[COND_FALSE21:%.*]]
+// CHECK: cond.true20:
+// CHECK-NEXT: [[TMP40:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8
+// CHECK-NEXT: br label [[COND_END22:%.*]]
+// CHECK: cond.false21:
+// CHECK-NEXT: [[TMP41:%.*]] = load i64, i64* [[DOTOMP_COMB_UB]], align 8
+// CHECK-NEXT: br label [[COND_END22]]
+// CHECK: cond.end22:
+// CHECK-NEXT: [[COND23:%.*]] = phi i64 [ [[TMP40]], [[COND_TRUE20]] ], [ [[TMP41]], [[COND_FALSE21]] ]
+// CHECK-NEXT: store i64 [[COND23]], i64* [[DOTOMP_COMB_UB]], align 8
+// CHECK-NEXT: [[TMP42:%.*]] = load i64, i64* [[DOTOMP_COMB_LB]], align 8
+// CHECK-NEXT: store i64 [[TMP42]], i64* [[DOTOMP_IV]], align 8
+// CHECK-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK: omp.inner.for.end:
+// CHECK-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK: omp.loop.exit:
+// CHECK-NEXT: [[TMP43:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-NEXT: [[TMP44:%.*]] = load i32, i32* [[TMP43]], align 4
+// CHECK-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP44]])
+// CHECK-NEXT: br label [[OMP_PRECOND_END]]
+// CHECK: omp.precond.end:
+// CHECK-NEXT: ret void
+//
+//
+// CHECK-LABEL: define {{[^@]+}}@__omp_outlined__9
+// CHECK-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], i64 [[N:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8
+// CHECK-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8
+// CHECK-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8
+// CHECK-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 8
+// CHECK-NEXT: [[DOTOMP_IV:%.*]] = alloca i64, align 8
+// CHECK-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK-NEXT: 
[[_TMP1:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[DOTCAPTURE_EXPR_3:%.*]] = alloca i64, align 8 +// CHECK-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[J:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[DOTOMP_LB:%.*]] = alloca i64, align 8 +// CHECK-NEXT: [[DOTOMP_UB:%.*]] = alloca i64, align 8 +// CHECK-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i64, align 8 +// CHECK-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[I10:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[J11:%.*]] = alloca i32, align 4 +// CHECK-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +// CHECK-NEXT: store i64 [[DOTPREVIOUS_LB_]], i64* [[DOTPREVIOUS_LB__ADDR]], align 8 +// CHECK-NEXT: store i64 [[DOTPREVIOUS_UB_]], i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 +// CHECK-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 8 +// CHECK-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* +// CHECK-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 8 +// CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK-NEXT: store i32 [[TMP2]], i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP3]], 0 +// CHECK-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK-NEXT: [[CONV4:%.*]] = sext i32 [[DIV]] to i64 +// CHECK-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK-NEXT: [[SUB5:%.*]] = sub nsw i32 [[TMP4]], 0 +// CHECK-NEXT: [[DIV6:%.*]] = sdiv i32 [[SUB5]], 1 +// CHECK-NEXT: [[CONV7:%.*]] = sext i32 [[DIV6]] to i64 +// CHECK-NEXT: [[MUL:%.*]] = mul nsw i64 [[CONV4]], [[CONV7]] +// CHECK-NEXT: [[SUB8:%.*]] = sub nsw i64 [[MUL]], 1 +// CHECK-NEXT: store i64 [[SUB8]], i64* [[DOTCAPTURE_EXPR_3]], align 8 +// CHECK-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK-NEXT: store i32 0, i32* [[J]], align 4 +// CHECK-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP5]] +// CHECK-NEXT: br i1 [[CMP]], label [[LAND_LHS_TRUE:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK: land.lhs.true: +// CHECK-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK-NEXT: [[CMP9:%.*]] = icmp slt i32 0, [[TMP6]] +// CHECK-NEXT: br i1 [[CMP9]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END]] +// CHECK: omp.precond.then: +// CHECK-NEXT: store i64 0, i64* [[DOTOMP_LB]], align 8 +// CHECK-NEXT: [[TMP7:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8 +// CHECK-NEXT: store i64 [[TMP7]], i64* [[DOTOMP_UB]], align 8 +// CHECK-NEXT: [[TMP8:%.*]] = load i64, i64* [[DOTPREVIOUS_LB__ADDR]], align 8 +// CHECK-NEXT: [[TMP9:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK-NEXT: store i64 [[TMP8]], i64* [[DOTOMP_LB]], align 8 +// CHECK-NEXT: store i64 [[TMP9]], i64* [[DOTOMP_UB]], align 8 +// CHECK-NEXT: store i64 1, i64* [[DOTOMP_STRIDE]], align 8 +// CHECK-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK-NEXT: [[TMP10:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK-NEXT: 
[[TMP11:%.*]] = load i32, i32* [[TMP10]], align 4 +// CHECK-NEXT: call void @__kmpc_for_static_init_8(%struct.ident_t* @[[GLOB2]], i32 [[TMP11]], i32 33, i32* [[DOTOMP_IS_LAST]], i64* [[DOTOMP_LB]], i64* [[DOTOMP_UB]], i64* [[DOTOMP_STRIDE]], i64 1, i64 1) +// CHECK-NEXT: [[TMP12:%.*]] = load i64, i64* [[DOTOMP_LB]], align 8 +// CHECK-NEXT: store i64 [[TMP12]], i64* [[DOTOMP_IV]], align 8 +// CHECK-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK: omp.inner.for.cond: +// CHECK-NEXT: [[TMP13:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8 +// CHECK-NEXT: [[TMP14:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK-NEXT: [[CMP12:%.*]] = icmp ule i64 [[TMP13]], [[TMP14]] +// CHECK-NEXT: br i1 [[CMP12]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK: omp.inner.for.body: +// CHECK-NEXT: [[TMP15:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8 +// CHECK-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK-NEXT: [[SUB13:%.*]] = sub nsw i32 [[TMP16]], 0 +// CHECK-NEXT: [[DIV14:%.*]] = sdiv i32 [[SUB13]], 1 +// CHECK-NEXT: [[MUL15:%.*]] = mul nsw i32 1, [[DIV14]] +// CHECK-NEXT: [[CONV16:%.*]] = sext i32 [[MUL15]] to i64 +// CHECK-NEXT: [[DIV17:%.*]] = sdiv i64 [[TMP15]], [[CONV16]] +// CHECK-NEXT: [[MUL18:%.*]] = mul nsw i64 [[DIV17]], 1 +// CHECK-NEXT: [[ADD:%.*]] = add nsw i64 0, [[MUL18]] +// CHECK-NEXT: [[CONV19:%.*]] = trunc i64 [[ADD]] to i32 +// CHECK-NEXT: store i32 [[CONV19]], i32* [[I10]], align 4 +// CHECK-NEXT: [[TMP17:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8 +// CHECK-NEXT: [[TMP18:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8 +// CHECK-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK-NEXT: [[SUB20:%.*]] = sub nsw i32 [[TMP19]], 0 +// CHECK-NEXT: [[DIV21:%.*]] = sdiv i32 [[SUB20]], 1 +// CHECK-NEXT: [[MUL22:%.*]] = mul nsw i32 1, [[DIV21]] +// CHECK-NEXT: [[CONV23:%.*]] = sext i32 [[MUL22]] to i64 +// CHECK-NEXT: [[DIV24:%.*]] = sdiv i64 [[TMP18]], [[CONV23]] +// CHECK-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK-NEXT: [[SUB25:%.*]] = sub nsw i32 [[TMP20]], 0 +// CHECK-NEXT: [[DIV26:%.*]] = sdiv i32 [[SUB25]], 1 +// CHECK-NEXT: [[MUL27:%.*]] = mul nsw i32 1, [[DIV26]] +// CHECK-NEXT: [[CONV28:%.*]] = sext i32 [[MUL27]] to i64 +// CHECK-NEXT: [[MUL29:%.*]] = mul nsw i64 [[DIV24]], [[CONV28]] +// CHECK-NEXT: [[SUB30:%.*]] = sub nsw i64 [[TMP17]], [[MUL29]] +// CHECK-NEXT: [[MUL31:%.*]] = mul nsw i64 [[SUB30]], 1 +// CHECK-NEXT: [[ADD32:%.*]] = add nsw i64 0, [[MUL31]] +// CHECK-NEXT: [[CONV33:%.*]] = trunc i64 [[ADD32]] to i32 +// CHECK-NEXT: store i32 [[CONV33]], i32* [[J11]], align 4 +// CHECK-NEXT: [[TMP21:%.*]] = load i32, i32* [[I10]], align 4 +// CHECK-NEXT: [[TMP22:%.*]] = load i32, i32* [[J11]], align 4 +// CHECK-NEXT: [[ADD34:%.*]] = add nsw i32 [[TMP21]], [[TMP22]] +// CHECK-NEXT: [[TMP23:%.*]] = load i32, i32* [[I10]], align 4 +// CHECK-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP23]] to i64 +// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* [[TMP0]], i64 0, i64 [[IDXPROM]] +// CHECK-NEXT: [[TMP24:%.*]] = load i32, i32* [[J11]], align 4 +// CHECK-NEXT: [[IDXPROM35:%.*]] = sext i32 [[TMP24]] to i64 +// CHECK-NEXT: [[ARRAYIDX36:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[ARRAYIDX]], i64 0, i64 [[IDXPROM35]] +// CHECK-NEXT: store i32 [[ADD34]], i32* [[ARRAYIDX36]], align 4 +// CHECK-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK: omp.body.continue: +// CHECK-NEXT: br label 
[[OMP_INNER_FOR_INC:%.*]] +// CHECK: omp.inner.for.inc: +// CHECK-NEXT: [[TMP25:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8 +// CHECK-NEXT: [[TMP26:%.*]] = load i64, i64* [[DOTOMP_STRIDE]], align 8 +// CHECK-NEXT: [[ADD37:%.*]] = add nsw i64 [[TMP25]], [[TMP26]] +// CHECK-NEXT: store i64 [[ADD37]], i64* [[DOTOMP_IV]], align 8 +// CHECK-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK: omp.inner.for.end: +// CHECK-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK: omp.loop.exit: +// CHECK-NEXT: [[TMP27:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK-NEXT: [[TMP28:%.*]] = load i32, i32* [[TMP27]], align 4 +// CHECK-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP28]]) +// CHECK-NEXT: br label [[OMP_PRECOND_END]] +// CHECK: omp.precond.end: +// CHECK-NEXT: ret void +// +// +// CHECK-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l74 +// CHECK-SAME: (i64 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32* [[V:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 8 +// CHECK-NEXT: [[V_ADDR:%.*]] = alloca i32*, align 8 +// CHECK-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8 +// CHECK-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 +// CHECK-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 8 +// CHECK-NEXT: store i32* [[V]], i32** [[V_ADDR]], align 8 +// CHECK-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* +// CHECK-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 8 +// CHECK-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) +// CHECK-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK: .execute: +// CHECK-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) +// CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK-NEXT: [[CONV1:%.*]] = bitcast i64* [[N_CASTED]] to i32* +// CHECK-NEXT: store i32 [[TMP2]], i32* [[CONV1]], align 4 +// CHECK-NEXT: [[TMP3:%.*]] = load i64, i64* [[N_CASTED]], align 8 +// CHECK-NEXT: [[TMP4:%.*]] = load i32*, i32** [[V_ADDR]], align 8 +// CHECK-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK-NEXT: call void @__omp_outlined__10(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i64 [[TMP3]], [1000 x i32]* [[TMP0]], i32* [[TMP4]]) #[[ATTR2]] +// CHECK-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK: .omp.deinit: +// CHECK-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) +// CHECK-NEXT: br label [[DOTEXIT:%.*]] +// CHECK: .exit: +// CHECK-NEXT: ret void +// +// +// CHECK-LABEL: define {{[^@]+}}@__omp_outlined__10 +// CHECK-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32* [[V:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 8 +// CHECK-NEXT: [[V_ADDR:%.*]] = alloca i32*, align 8 +// CHECK-NEXT: [[DOTOMP_IV:%.*]] 
= alloca i32, align 4 +// CHECK-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[I3:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8 +// CHECK-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [5 x i8*], align 8 +// CHECK-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +// CHECK-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 +// CHECK-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 8 +// CHECK-NEXT: store i32* [[V]], i32** [[V_ADDR]], align 8 +// CHECK-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* +// CHECK-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 8 +// CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 +// CHECK-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] +// CHECK-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK: omp.precond.then: +// CHECK-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK-NEXT: [[TMP5:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4 +// CHECK-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP6]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) +// CHECK-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP7]], [[TMP8]] +// CHECK-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK: cond.true: +// CHECK-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK-NEXT: br label [[COND_END:%.*]] +// CHECK: cond.false: +// CHECK-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK-NEXT: br label [[COND_END]] +// CHECK: cond.end: +// CHECK-NEXT: [[COND:%.*]] = phi i32 [ [[TMP9]], [[COND_TRUE]] ], [ [[TMP10]], [[COND_FALSE]] ] +// CHECK-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], 
align 4 +// CHECK-NEXT: store i32 [[TMP11]], i32* [[DOTOMP_IV]], align 4 +// CHECK-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK: omp.inner.for.cond: +// CHECK-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP13]], 1 +// CHECK-NEXT: [[CMP5:%.*]] = icmp slt i32 [[TMP12]], [[ADD]] +// CHECK-NEXT: br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK: omp.inner.for.body: +// CHECK-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK-NEXT: [[TMP15:%.*]] = zext i32 [[TMP14]] to i64 +// CHECK-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK-NEXT: [[TMP17:%.*]] = zext i32 [[TMP16]] to i64 +// CHECK-NEXT: [[TMP18:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK-NEXT: [[CONV6:%.*]] = bitcast i64* [[N_CASTED]] to i32* +// CHECK-NEXT: store i32 [[TMP18]], i32* [[CONV6]], align 4 +// CHECK-NEXT: [[TMP19:%.*]] = load i64, i64* [[N_CASTED]], align 8 +// CHECK-NEXT: [[TMP20:%.*]] = load i32*, i32** [[V_ADDR]], align 8 +// CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0 +// CHECK-NEXT: [[TMP22:%.*]] = inttoptr i64 [[TMP15]] to i8* +// CHECK-NEXT: store i8* [[TMP22]], i8** [[TMP21]], align 8 +// CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1 +// CHECK-NEXT: [[TMP24:%.*]] = inttoptr i64 [[TMP17]] to i8* +// CHECK-NEXT: store i8* [[TMP24]], i8** [[TMP23]], align 8 +// CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 2 +// CHECK-NEXT: [[TMP26:%.*]] = inttoptr i64 [[TMP19]] to i8* +// CHECK-NEXT: store i8* [[TMP26]], i8** [[TMP25]], align 8 +// CHECK-NEXT: [[TMP27:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 3 +// CHECK-NEXT: [[TMP28:%.*]] = bitcast [1000 x i32]* [[TMP0]] to i8* +// CHECK-NEXT: store i8* [[TMP28]], i8** [[TMP27]], align 8 +// CHECK-NEXT: [[TMP29:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 4 +// CHECK-NEXT: [[TMP30:%.*]] = bitcast i32* [[TMP20]] to i8* +// CHECK-NEXT: store i8* [[TMP30]], i8** [[TMP29]], align 8 +// CHECK-NEXT: [[TMP31:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK-NEXT: [[TMP32:%.*]] = load i32, i32* [[TMP31]], align 4 +// CHECK-NEXT: [[TMP33:%.*]] = bitcast [5 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP32]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i64, i64, i64, [1000 x i32]*, i32*)* @__omp_outlined__11 to i8*), i8* null, i8** [[TMP33]], i64 5) +// CHECK-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK: omp.inner.for.inc: +// CHECK-NEXT: [[TMP34:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK-NEXT: [[TMP35:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP34]], [[TMP35]] +// CHECK-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_IV]], align 4 +// CHECK-NEXT: [[TMP36:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK-NEXT: [[TMP37:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP36]], [[TMP37]] +// CHECK-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK-NEXT: [[TMP38:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK-NEXT: 
[[TMP39:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP38]], [[TMP39]] +// CHECK-NEXT: store i32 [[ADD9]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK-NEXT: [[TMP40:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK-NEXT: [[TMP41:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK-NEXT: [[CMP10:%.*]] = icmp sgt i32 [[TMP40]], [[TMP41]] +// CHECK-NEXT: br i1 [[CMP10]], label [[COND_TRUE11:%.*]], label [[COND_FALSE12:%.*]] +// CHECK: cond.true11: +// CHECK-NEXT: [[TMP42:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK-NEXT: br label [[COND_END13:%.*]] +// CHECK: cond.false12: +// CHECK-NEXT: [[TMP43:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK-NEXT: br label [[COND_END13]] +// CHECK: cond.end13: +// CHECK-NEXT: [[COND14:%.*]] = phi i32 [ [[TMP42]], [[COND_TRUE11]] ], [ [[TMP43]], [[COND_FALSE12]] ] +// CHECK-NEXT: store i32 [[COND14]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK-NEXT: [[TMP44:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK-NEXT: store i32 [[TMP44]], i32* [[DOTOMP_IV]], align 4 +// CHECK-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK: omp.inner.for.end: +// CHECK-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK: omp.loop.exit: +// CHECK-NEXT: [[TMP45:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK-NEXT: [[TMP46:%.*]] = load i32, i32* [[TMP45]], align 4 +// CHECK-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP46]]) +// CHECK-NEXT: br label [[OMP_PRECOND_END]] +// CHECK: omp.precond.end: +// CHECK-NEXT: ret void +// +// +// CHECK-LABEL: define {{[^@]+}}@__omp_outlined__11 +// CHECK-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], i64 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32* [[V:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8 +// CHECK-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8 +// CHECK-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 8 +// CHECK-NEXT: [[V_ADDR:%.*]] = alloca i32*, align 8 +// CHECK-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[I5:%.*]] = alloca i32, align 4 +// CHECK-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +// CHECK-NEXT: store i64 [[DOTPREVIOUS_LB_]], i64* [[DOTPREVIOUS_LB__ADDR]], align 8 +// CHECK-NEXT: store i64 [[DOTPREVIOUS_UB_]], i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 +// CHECK-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 8 +// CHECK-NEXT: store i32* [[V]], i32** [[V_ADDR]], align 8 +// CHECK-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* 
+// CHECK-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 8 +// CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 +// CHECK-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] +// CHECK-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK: omp.precond.then: +// CHECK-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 +// CHECK-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_UB]], align 4 +// CHECK-NEXT: [[TMP5:%.*]] = load i64, i64* [[DOTPREVIOUS_LB__ADDR]], align 8 +// CHECK-NEXT: [[CONV3:%.*]] = trunc i64 [[TMP5]] to i32 +// CHECK-NEXT: [[TMP6:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK-NEXT: [[CONV4:%.*]] = trunc i64 [[TMP6]] to i32 +// CHECK-NEXT: store i32 [[CONV3]], i32* [[DOTOMP_LB]], align 4 +// CHECK-NEXT: store i32 [[CONV4]], i32* [[DOTOMP_UB]], align 4 +// CHECK-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4 +// CHECK-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP8]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) +// CHECK-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK-NEXT: store i32 [[TMP9]], i32* [[DOTOMP_IV]], align 4 +// CHECK-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK: omp.inner.for.cond: +// CHECK-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK-NEXT: [[CONV6:%.*]] = sext i32 [[TMP10]] to i64 +// CHECK-NEXT: [[TMP11:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK-NEXT: [[CMP7:%.*]] = icmp ule i64 [[CONV6]], [[TMP11]] +// CHECK-NEXT: br i1 [[CMP7]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK: omp.inner.for.body: +// CHECK-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP12]], 1 +// CHECK-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK-NEXT: store i32 [[ADD]], i32* [[I5]], align 4 +// CHECK-NEXT: [[TMP13:%.*]] = load i32*, i32** [[V_ADDR]], align 8 +// CHECK-NEXT: [[TMP14:%.*]] = load i32, i32* [[I5]], align 4 +// CHECK-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP14]] to i64 +// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TMP13]], i64 [[IDXPROM]] +// CHECK-NEXT: [[TMP15:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +// CHECK-NEXT: [[TMP16:%.*]] = load i32, i32* [[I5]], align 4 +// CHECK-NEXT: [[IDXPROM8:%.*]] = sext i32 [[TMP16]] to i64 +// CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds [1000 x i32], [1000 x i32]* [[TMP0]], i64 0, i64 [[IDXPROM8]] +// CHECK-NEXT: store i32 [[TMP15]], i32* [[ARRAYIDX9]], align 4 +// CHECK-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK: omp.body.continue: +// CHECK-NEXT: br label 
[[OMP_INNER_FOR_INC:%.*]] +// CHECK: omp.inner.for.inc: +// CHECK-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK-NEXT: [[ADD10:%.*]] = add nsw i32 [[TMP17]], [[TMP18]] +// CHECK-NEXT: store i32 [[ADD10]], i32* [[DOTOMP_IV]], align 4 +// CHECK-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK: omp.inner.for.end: +// CHECK-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK: omp.loop.exit: +// CHECK-NEXT: [[TMP19:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK-NEXT: [[TMP20:%.*]] = load i32, i32* [[TMP19]], align 4 +// CHECK-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP20]]) +// CHECK-NEXT: br label [[OMP_PRECOND_END]] +// CHECK: omp.precond.end: +// CHECK-NEXT: ret void +// +// +// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l43 +// CHECK1-SAME: (i64 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i64 [[L:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 8 +// CHECK1-NEXT: [[L_ADDR:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[L_CASTED:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK1-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 +// CHECK1-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 8 +// CHECK1-NEXT: store i64 [[L]], i64* [[L_ADDR]], align 8 +// CHECK1-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* +// CHECK1-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 8 +// CHECK1-NEXT: [[CONV1:%.*]] = bitcast i64* [[L_ADDR]] to i32* +// CHECK1-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK1-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) +// CHECK1-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK1: .execute: +// CHECK1-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3:[0-9]+]]) +// CHECK1-NEXT: [[TMP2:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK1-NEXT: [[CONV2:%.*]] = bitcast i64* [[N_CASTED]] to i32* +// CHECK1-NEXT: store i32 [[TMP2]], i32* [[CONV2]], align 4 +// CHECK1-NEXT: [[TMP3:%.*]] = load i64, i64* [[N_CASTED]], align 8 +// CHECK1-NEXT: [[TMP4:%.*]] = load i32, i32* [[CONV1]], align 8 +// CHECK1-NEXT: [[CONV3:%.*]] = bitcast i64* [[L_CASTED]] to i32* +// CHECK1-NEXT: store i32 [[TMP4]], i32* [[CONV3]], align 4 +// CHECK1-NEXT: [[TMP5:%.*]] = load i64, i64* [[L_CASTED]], align 8 +// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK1-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i64 [[TMP3]], [1000 x i32]* [[TMP0]], i64 [[TMP5]]) #[[ATTR2:[0-9]+]] +// CHECK1-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK1: .omp.deinit: +// CHECK1-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) +// CHECK1-NEXT: br label [[DOTEXIT:%.*]] +// CHECK1: .exit: +// CHECK1-NEXT: ret void +// +// +// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__ +// CHECK1-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i64 [[L:%.*]]) #[[ATTR0]] { 
+// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK1-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 8 +// CHECK1-NEXT: [[L_ADDR:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTCAPTURE_EXPR_3:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[I5:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[L_CASTED:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [5 x i8*], align 8 +// CHECK1-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK1-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +// CHECK1-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 +// CHECK1-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 8 +// CHECK1-NEXT: store i64 [[L]], i64* [[L_ADDR]], align 8 +// CHECK1-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* +// CHECK1-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 8 +// CHECK1-NEXT: [[CONV1:%.*]] = bitcast i64* [[L_ADDR]] to i32* +// CHECK1-NEXT: [[TMP1:%.*]] = call i8* @__kmpc_data_sharing_push_stack(i64 4, i16 1) +// CHECK1-NEXT: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to %struct._globalized_locals_ty* +// CHECK1-NEXT: [[L2:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP2]], i32 0, i32 0 +// CHECK1-NEXT: [[TMP3:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK1-NEXT: store i32 [[TMP3]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK1-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK1-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP4]], 0 +// CHECK1-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK1-NEXT: [[SUB4:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK1-NEXT: store i32 [[SUB4]], i32* [[DOTCAPTURE_EXPR_3]], align 4 +// CHECK1-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK1-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK1-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP5]] +// CHECK1-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK1: omp.precond.then: +// CHECK1-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK1-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4 +// CHECK1-NEXT: store i32 [[TMP6]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK1-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK1-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK1-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK1-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4 +// CHECK1-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1:[0-9]+]], i32 [[TMP8]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 128) +// CHECK1-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// 
CHECK1-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4 +// CHECK1-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[TMP9]], [[TMP10]] +// CHECK1-NEXT: br i1 [[CMP6]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK1: cond.true: +// CHECK1-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4 +// CHECK1-NEXT: br label [[COND_END:%.*]] +// CHECK1: cond.false: +// CHECK1-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK1-NEXT: br label [[COND_END]] +// CHECK1: cond.end: +// CHECK1-NEXT: [[COND:%.*]] = phi i32 [ [[TMP11]], [[COND_TRUE]] ], [ [[TMP12]], [[COND_FALSE]] ] +// CHECK1-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK1-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK1-NEXT: store i32 [[TMP13]], i32* [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK1: omp.inner.for.cond: +// CHECK1-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4 +// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP15]], 1 +// CHECK1-NEXT: [[CMP7:%.*]] = icmp slt i32 [[TMP14]], [[ADD]] +// CHECK1-NEXT: br i1 [[CMP7]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK1: omp.inner.for.body: +// CHECK1-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK1-NEXT: [[TMP17:%.*]] = zext i32 [[TMP16]] to i64 +// CHECK1-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK1-NEXT: [[TMP19:%.*]] = zext i32 [[TMP18]] to i64 +// CHECK1-NEXT: [[TMP20:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK1-NEXT: [[CONV8:%.*]] = bitcast i64* [[N_CASTED]] to i32* +// CHECK1-NEXT: store i32 [[TMP20]], i32* [[CONV8]], align 4 +// CHECK1-NEXT: [[TMP21:%.*]] = load i64, i64* [[N_CASTED]], align 8 +// CHECK1-NEXT: [[TMP22:%.*]] = load i32, i32* [[CONV1]], align 8 +// CHECK1-NEXT: [[CONV9:%.*]] = bitcast i64* [[L_CASTED]] to i32* +// CHECK1-NEXT: store i32 [[TMP22]], i32* [[CONV9]], align 4 +// CHECK1-NEXT: [[TMP23:%.*]] = load i64, i64* [[L_CASTED]], align 8 +// CHECK1-NEXT: [[TMP24:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0 +// CHECK1-NEXT: [[TMP25:%.*]] = inttoptr i64 [[TMP17]] to i8* +// CHECK1-NEXT: store i8* [[TMP25]], i8** [[TMP24]], align 8 +// CHECK1-NEXT: [[TMP26:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1 +// CHECK1-NEXT: [[TMP27:%.*]] = inttoptr i64 [[TMP19]] to i8* +// CHECK1-NEXT: store i8* [[TMP27]], i8** [[TMP26]], align 8 +// CHECK1-NEXT: [[TMP28:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 2 +// CHECK1-NEXT: [[TMP29:%.*]] = inttoptr i64 [[TMP21]] to i8* +// CHECK1-NEXT: store i8* [[TMP29]], i8** [[TMP28]], align 8 +// CHECK1-NEXT: [[TMP30:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 3 +// CHECK1-NEXT: [[TMP31:%.*]] = bitcast [1000 x i32]* [[TMP0]] to i8* +// CHECK1-NEXT: store i8* [[TMP31]], i8** [[TMP30]], align 8 +// CHECK1-NEXT: [[TMP32:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 4 +// CHECK1-NEXT: [[TMP33:%.*]] = inttoptr i64 [[TMP23]] to i8* +// CHECK1-NEXT: store i8* [[TMP33]], i8** [[TMP32]], align 8 +// CHECK1-NEXT: [[TMP34:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK1-NEXT: [[TMP35:%.*]] = load i32, i32* [[TMP34]], align 4 +// CHECK1-NEXT: [[TMP36:%.*]] = bitcast [5 x i8*]* 
[[CAPTURED_VARS_ADDRS]] to i8** +// CHECK1-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP35]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i64, i64, i64, [1000 x i32]*, i64)* @__omp_outlined__1 to i8*), i8* null, i8** [[TMP36]], i64 5) +// CHECK1-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK1: omp.inner.for.inc: +// CHECK1-NEXT: [[TMP37:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: [[TMP38:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK1-NEXT: [[ADD10:%.*]] = add nsw i32 [[TMP37]], [[TMP38]] +// CHECK1-NEXT: store i32 [[ADD10]], i32* [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: [[TMP39:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK1-NEXT: [[TMP40:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK1-NEXT: [[ADD11:%.*]] = add nsw i32 [[TMP39]], [[TMP40]] +// CHECK1-NEXT: store i32 [[ADD11]], i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK1-NEXT: [[TMP41:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK1-NEXT: [[TMP42:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK1-NEXT: [[ADD12:%.*]] = add nsw i32 [[TMP41]], [[TMP42]] +// CHECK1-NEXT: store i32 [[ADD12]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK1-NEXT: [[TMP43:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK1-NEXT: [[TMP44:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4 +// CHECK1-NEXT: [[CMP13:%.*]] = icmp sgt i32 [[TMP43]], [[TMP44]] +// CHECK1-NEXT: br i1 [[CMP13]], label [[COND_TRUE14:%.*]], label [[COND_FALSE15:%.*]] +// CHECK1: cond.true14: +// CHECK1-NEXT: [[TMP45:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4 +// CHECK1-NEXT: br label [[COND_END16:%.*]] +// CHECK1: cond.false15: +// CHECK1-NEXT: [[TMP46:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK1-NEXT: br label [[COND_END16]] +// CHECK1: cond.end16: +// CHECK1-NEXT: [[COND17:%.*]] = phi i32 [ [[TMP45]], [[COND_TRUE14]] ], [ [[TMP46]], [[COND_FALSE15]] ] +// CHECK1-NEXT: store i32 [[COND17]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK1-NEXT: [[TMP47:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK1-NEXT: store i32 [[TMP47]], i32* [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK1: omp.inner.for.end: +// CHECK1-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK1: omp.loop.exit: +// CHECK1-NEXT: [[TMP48:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK1-NEXT: [[TMP49:%.*]] = load i32, i32* [[TMP48]], align 4 +// CHECK1-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP49]]) +// CHECK1-NEXT: [[TMP50:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK1-NEXT: [[TMP51:%.*]] = icmp ne i32 [[TMP50]], 0 +// CHECK1-NEXT: br i1 [[TMP51]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]] +// CHECK1: .omp.lastprivate.then: +// CHECK1-NEXT: [[TMP52:%.*]] = load i32, i32* [[CONV1]], align 8 +// CHECK1-NEXT: store i32 [[TMP52]], i32* [[CONV1]], align 8 +// CHECK1-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]] +// CHECK1: .omp.lastprivate.done: +// CHECK1-NEXT: br label [[OMP_PRECOND_END]] +// CHECK1: omp.precond.end: +// CHECK1-NEXT: call void @__kmpc_data_sharing_pop_stack(i8* [[TMP1]]) +// CHECK1-NEXT: ret void +// +// +// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__1 +// CHECK1-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], i64 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i64 [[L:%.*]]) #[[ATTR0]] { +// 
CHECK1-NEXT: entry: +// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK1-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 8 +// CHECK1-NEXT: [[L_ADDR:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[I6:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK1-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +// CHECK1-NEXT: store i64 [[DOTPREVIOUS_LB_]], i64* [[DOTPREVIOUS_LB__ADDR]], align 8 +// CHECK1-NEXT: store i64 [[DOTPREVIOUS_UB_]], i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK1-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 +// CHECK1-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 8 +// CHECK1-NEXT: store i64 [[L]], i64* [[L_ADDR]], align 8 +// CHECK1-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* +// CHECK1-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 8 +// CHECK1-NEXT: [[CONV1:%.*]] = bitcast i64* [[L_ADDR]] to i32* +// CHECK1-NEXT: [[TMP1:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK1-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK1-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 +// CHECK1-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK1-NEXT: [[SUB3:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK1-NEXT: store i32 [[SUB3]], i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK1-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK1-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK1-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] +// CHECK1-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK1: omp.precond.then: +// CHECK1-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 +// CHECK1-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK1-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_UB]], align 4 +// CHECK1-NEXT: [[TMP5:%.*]] = load i64, i64* [[DOTPREVIOUS_LB__ADDR]], align 8 +// CHECK1-NEXT: [[CONV4:%.*]] = trunc i64 [[TMP5]] to i32 +// CHECK1-NEXT: [[TMP6:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK1-NEXT: [[CONV5:%.*]] = trunc i64 [[TMP6]] to i32 +// CHECK1-NEXT: store i32 [[CONV4]], i32* [[DOTOMP_LB]], align 4 +// CHECK1-NEXT: store i32 [[CONV5]], i32* [[DOTOMP_UB]], align 4 +// CHECK1-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK1-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK1-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK1-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4 +// CHECK1-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP8]], i32 33, i32* 
[[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 32) +// CHECK1-NEXT: br label [[OMP_DISPATCH_COND:%.*]] +// CHECK1: omp.dispatch.cond: +// CHECK1-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK1-NEXT: [[CONV7:%.*]] = sext i32 [[TMP9]] to i64 +// CHECK1-NEXT: [[TMP10:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK1-NEXT: [[CMP8:%.*]] = icmp ugt i64 [[CONV7]], [[TMP10]] +// CHECK1-NEXT: br i1 [[CMP8]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK1: cond.true: +// CHECK1-NEXT: [[TMP11:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK1-NEXT: br label [[COND_END:%.*]] +// CHECK1: cond.false: +// CHECK1-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK1-NEXT: [[CONV9:%.*]] = sext i32 [[TMP12]] to i64 +// CHECK1-NEXT: br label [[COND_END]] +// CHECK1: cond.end: +// CHECK1-NEXT: [[COND:%.*]] = phi i64 [ [[TMP11]], [[COND_TRUE]] ], [ [[CONV9]], [[COND_FALSE]] ] +// CHECK1-NEXT: [[CONV10:%.*]] = trunc i64 [[COND]] to i32 +// CHECK1-NEXT: store i32 [[CONV10]], i32* [[DOTOMP_UB]], align 4 +// CHECK1-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK1-NEXT: store i32 [[TMP13]], i32* [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK1-NEXT: [[CMP11:%.*]] = icmp sle i32 [[TMP14]], [[TMP15]] +// CHECK1-NEXT: br i1 [[CMP11]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]] +// CHECK1: omp.dispatch.body: +// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK1: omp.inner.for.cond: +// CHECK1-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK1-NEXT: [[CMP12:%.*]] = icmp sle i32 [[TMP16]], [[TMP17]] +// CHECK1-NEXT: br i1 [[CMP12]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK1: omp.inner.for.body: +// CHECK1-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP18]], 1 +// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK1-NEXT: store i32 [[ADD]], i32* [[I6]], align 4 +// CHECK1-NEXT: [[TMP19:%.*]] = load i32, i32* [[I6]], align 4 +// CHECK1-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP19]] to i64 +// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1000 x i32], [1000 x i32]* [[TMP0]], i64 0, i64 [[IDXPROM]] +// CHECK1-NEXT: store i32 1, i32* [[ARRAYIDX]], align 4 +// CHECK1-NEXT: [[TMP20:%.*]] = load i32, i32* [[I6]], align 4 +// CHECK1-NEXT: store i32 [[TMP20]], i32* [[CONV1]], align 8 +// CHECK1-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK1: omp.body.continue: +// CHECK1-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK1: omp.inner.for.inc: +// CHECK1-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: [[ADD13:%.*]] = add nsw i32 [[TMP21]], 1 +// CHECK1-NEXT: store i32 [[ADD13]], i32* [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK1: omp.inner.for.end: +// CHECK1-NEXT: br label [[OMP_DISPATCH_INC:%.*]] +// CHECK1: omp.dispatch.inc: +// CHECK1-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK1-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK1-NEXT: [[ADD14:%.*]] = add nsw i32 [[TMP22]], [[TMP23]] +// CHECK1-NEXT: store i32 [[ADD14]], i32* [[DOTOMP_LB]], align 4 +// CHECK1-NEXT: [[TMP24:%.*]] = load i32, i32* 
[[DOTOMP_UB]], align 4 +// CHECK1-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK1-NEXT: [[ADD15:%.*]] = add nsw i32 [[TMP24]], [[TMP25]] +// CHECK1-NEXT: store i32 [[ADD15]], i32* [[DOTOMP_UB]], align 4 +// CHECK1-NEXT: br label [[OMP_DISPATCH_COND]] +// CHECK1: omp.dispatch.end: +// CHECK1-NEXT: [[TMP26:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK1-NEXT: [[TMP27:%.*]] = load i32, i32* [[TMP26]], align 4 +// CHECK1-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP27]]) +// CHECK1-NEXT: [[TMP28:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK1-NEXT: [[TMP29:%.*]] = icmp ne i32 [[TMP28]], 0 +// CHECK1-NEXT: br i1 [[TMP29]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]] +// CHECK1: .omp.lastprivate.then: +// CHECK1-NEXT: [[TMP30:%.*]] = load i32, i32* [[CONV1]], align 8 +// CHECK1-NEXT: store i32 [[TMP30]], i32* [[CONV1]], align 8 +// CHECK1-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]] +// CHECK1: .omp.lastprivate.done: +// CHECK1-NEXT: br label [[OMP_PRECOND_END]] +// CHECK1: omp.precond.end: +// CHECK1-NEXT: ret void +// +// +// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l49 +// CHECK1-SAME: (i64 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 8 +// CHECK1-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK1-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 +// CHECK1-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 8 +// CHECK1-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* +// CHECK1-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 8 +// CHECK1-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK1-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) +// CHECK1-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK1: .execute: +// CHECK1-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) +// CHECK1-NEXT: [[TMP2:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK1-NEXT: [[CONV1:%.*]] = bitcast i64* [[N_CASTED]] to i32* +// CHECK1-NEXT: store i32 [[TMP2]], i32* [[CONV1]], align 4 +// CHECK1-NEXT: [[TMP3:%.*]] = load i64, i64* [[N_CASTED]], align 8 +// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK1-NEXT: call void @__omp_outlined__2(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i64 [[TMP3]], [1000 x i16]* [[TMP0]]) #[[ATTR2]] +// CHECK1-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK1: .omp.deinit: +// CHECK1-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) +// CHECK1-NEXT: br label [[DOTEXIT:%.*]] +// CHECK1: .exit: +// CHECK1-NEXT: ret void +// +// +// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__2 +// CHECK1-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK1-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 +// 
CHECK1-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 8 +// CHECK1-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[I3:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x i8*], align 8 +// CHECK1-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK1-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +// CHECK1-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 +// CHECK1-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 8 +// CHECK1-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* +// CHECK1-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 8 +// CHECK1-NEXT: [[TMP1:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK1-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK1-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 +// CHECK1-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK1-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK1-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK1-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK1-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK1-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] +// CHECK1-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK1: omp.precond.then: +// CHECK1-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK1-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK1-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK1-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK1-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK1-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK1-NEXT: [[TMP5:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK1-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4 +// CHECK1-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP6]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) +// CHECK1-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK1-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK1-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP7]], [[TMP8]] +// CHECK1-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK1: cond.true: +// CHECK1-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK1-NEXT: br label [[COND_END:%.*]] +// CHECK1: cond.false: +// CHECK1-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK1-NEXT: br label [[COND_END]] +// CHECK1: cond.end: +// CHECK1-NEXT: [[COND:%.*]] = phi i32 [ [[TMP9]], [[COND_TRUE]] ], [ [[TMP10]], [[COND_FALSE]] ] +// CHECK1-NEXT: store i32 [[COND]], i32* 
+// CHECK1-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK1-NEXT: store i32 [[TMP11]], i32* [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK1: omp.inner.for.cond:
+// CHECK1-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP13]], 1
+// CHECK1-NEXT: [[CMP5:%.*]] = icmp slt i32 [[TMP12]], [[ADD]]
+// CHECK1-NEXT: br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK1: omp.inner.for.body:
+// CHECK1-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK1-NEXT: [[TMP15:%.*]] = zext i32 [[TMP14]] to i64
+// CHECK1-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: [[TMP17:%.*]] = zext i32 [[TMP16]] to i64
+// CHECK1-NEXT: [[TMP18:%.*]] = load i32, i32* [[CONV]], align 8
+// CHECK1-NEXT: [[CONV6:%.*]] = bitcast i64* [[N_CASTED]] to i32*
+// CHECK1-NEXT: store i32 [[TMP18]], i32* [[CONV6]], align 4
+// CHECK1-NEXT: [[TMP19:%.*]] = load i64, i64* [[N_CASTED]], align 8
+// CHECK1-NEXT: [[TMP20:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0
+// CHECK1-NEXT: [[TMP21:%.*]] = inttoptr i64 [[TMP15]] to i8*
+// CHECK1-NEXT: store i8* [[TMP21]], i8** [[TMP20]], align 8
+// CHECK1-NEXT: [[TMP22:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1
+// CHECK1-NEXT: [[TMP23:%.*]] = inttoptr i64 [[TMP17]] to i8*
+// CHECK1-NEXT: store i8* [[TMP23]], i8** [[TMP22]], align 8
+// CHECK1-NEXT: [[TMP24:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 2
+// CHECK1-NEXT: [[TMP25:%.*]] = inttoptr i64 [[TMP19]] to i8*
+// CHECK1-NEXT: store i8* [[TMP25]], i8** [[TMP24]], align 8
+// CHECK1-NEXT: [[TMP26:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 3
+// CHECK1-NEXT: [[TMP27:%.*]] = bitcast [1000 x i16]* [[TMP0]] to i8*
+// CHECK1-NEXT: store i8* [[TMP27]], i8** [[TMP26]], align 8
+// CHECK1-NEXT: [[TMP28:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT: [[TMP29:%.*]] = load i32, i32* [[TMP28]], align 4
+// CHECK1-NEXT: [[TMP30:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK1-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP29]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i64, i64, i64, [1000 x i16]*)* @__omp_outlined__3 to i8*), i8* null, i8** [[TMP30]], i64 4)
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK1: omp.inner.for.inc:
+// CHECK1-NEXT: [[TMP31:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: [[TMP32:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK1-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP31]], [[TMP32]]
+// CHECK1-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: [[TMP33:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK1-NEXT: [[TMP34:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK1-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP33]], [[TMP34]]
+// CHECK1-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK1-NEXT: [[TMP35:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: [[TMP36:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK1-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP35]], [[TMP36]]
+// CHECK1-NEXT: store i32 [[ADD9]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: [[TMP37:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: [[TMP38:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK1-NEXT: [[CMP10:%.*]] = icmp sgt i32 [[TMP37]], [[TMP38]]
+// CHECK1-NEXT: br i1 [[CMP10]], label [[COND_TRUE11:%.*]], label [[COND_FALSE12:%.*]]
+// CHECK1: cond.true11:
+// CHECK1-NEXT: [[TMP39:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK1-NEXT: br label [[COND_END13:%.*]]
+// CHECK1: cond.false12:
+// CHECK1-NEXT: [[TMP40:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: br label [[COND_END13]]
+// CHECK1: cond.end13:
+// CHECK1-NEXT: [[COND14:%.*]] = phi i32 [ [[TMP39]], [[COND_TRUE11]] ], [ [[TMP40]], [[COND_FALSE12]] ]
+// CHECK1-NEXT: store i32 [[COND14]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: [[TMP41:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK1-NEXT: store i32 [[TMP41]], i32* [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK1: omp.inner.for.end:
+// CHECK1-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK1: omp.loop.exit:
+// CHECK1-NEXT: [[TMP42:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT: [[TMP43:%.*]] = load i32, i32* [[TMP42]], align 4
+// CHECK1-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP43]])
+// CHECK1-NEXT: br label [[OMP_PRECOND_END]]
+// CHECK1: omp.precond.end:
+// CHECK1-NEXT: ret void
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__3
+// CHECK1-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], i64 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK1-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 8
+// CHECK1-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[I5:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
+// CHECK1-NEXT: store i64 [[DOTPREVIOUS_LB_]], i64* [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK1-NEXT: store i64 [[DOTPREVIOUS_UB_]], i64* [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK1-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8
+// CHECK1-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 8
+// CHECK1-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32*
+// CHECK1-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 8
+// CHECK1-NEXT: [[TMP1:%.*]] = load i32, i32* [[CONV]], align 8
+// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK1-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK1-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0
+// CHECK1-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK1-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1
+// CHECK1-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK1-NEXT: store i32 0, i32* [[I]], align 4
+// CHECK1-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK1-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]]
+// CHECK1-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]]
+// CHECK1: omp.precond.then:
+// CHECK1-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK1-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK1-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_UB]], align 4
+// CHECK1-NEXT: [[TMP5:%.*]] = load i64, i64* [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK1-NEXT: [[CONV3:%.*]] = trunc i64 [[TMP5]] to i32
+// CHECK1-NEXT: [[TMP6:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK1-NEXT: [[CONV4:%.*]] = trunc i64 [[TMP6]] to i32
+// CHECK1-NEXT: store i32 [[CONV3]], i32* [[DOTOMP_LB]], align 4
+// CHECK1-NEXT: store i32 [[CONV4]], i32* [[DOTOMP_UB]], align 4
+// CHECK1-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK1-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK1-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4
+// CHECK1-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP8]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK1-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK1-NEXT: store i32 [[TMP9]], i32* [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK1: omp.inner.for.cond:
+// CHECK1-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: [[CONV6:%.*]] = sext i32 [[TMP10]] to i64
+// CHECK1-NEXT: [[TMP11:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK1-NEXT: [[CMP7:%.*]] = icmp ule i64 [[CONV6]], [[TMP11]]
+// CHECK1-NEXT: br i1 [[CMP7]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK1: omp.inner.for.body:
+// CHECK1-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP12]], 1
+// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK1-NEXT: store i32 [[ADD]], i32* [[I5]], align 4
+// CHECK1-NEXT: [[TMP13:%.*]] = load i32, i32* [[I5]], align 4
+// CHECK1-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP13]] to i64
+// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1000 x i16], [1000 x i16]* [[TMP0]], i64 0, i64 [[IDXPROM]]
+// CHECK1-NEXT: [[TMP14:%.*]] = load i16, i16* [[ARRAYIDX]], align 2
+// CHECK1-NEXT: [[CONV8:%.*]] = sext i16 [[TMP14]] to i32
+// CHECK1-NEXT: [[ADD9:%.*]] = add nsw i32 [[CONV8]], 1
+// CHECK1-NEXT: [[CONV10:%.*]] = trunc i32 [[ADD9]] to i16
+// CHECK1-NEXT: store i16 [[CONV10]], i16* [[ARRAYIDX]], align 2
+// CHECK1-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK1: omp.body.continue:
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK1: omp.inner.for.inc:
+// CHECK1-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK1-NEXT: [[ADD11:%.*]] = add nsw i32 [[TMP15]], [[TMP16]]
+// CHECK1-NEXT: store i32 [[ADD11]], i32* [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK1: omp.inner.for.end:
+// CHECK1-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK1: omp.loop.exit:
+// CHECK1-NEXT: [[TMP17:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT: [[TMP18:%.*]] = load i32, i32* [[TMP17]], align 4
+// CHECK1-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP18]])
+// CHECK1-NEXT: br label [[OMP_PRECOND_END]]
+// CHECK1: omp.precond.end:
+// CHECK1-NEXT: ret void
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l54
+// CHECK1-SAME: ([10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 8
+// CHECK1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK1-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 8
+// CHECK1-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 8
+// CHECK1-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK1-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0)
+// CHECK1-NEXT: br label [[DOTEXECUTE:%.*]]
+// CHECK1: .execute:
+// CHECK1-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]])
+// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK1-NEXT: call void @__omp_outlined__4(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x i32]* [[TMP0]]) #[[ATTR2]]
+// CHECK1-NEXT: br label [[DOTOMP_DEINIT:%.*]]
+// CHECK1: .omp.deinit:
+// CHECK1-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0)
+// CHECK1-NEXT: br label [[DOTEXIT:%.*]]
+// CHECK1: .exit:
+// CHECK1-NEXT: ret void
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__4
+// CHECK1-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK1-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 8
+// CHECK1-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [3 x i8*], align 8
+// CHECK1-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
+// CHECK1-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 8
+// CHECK1-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 8
+// CHECK1-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK1-NEXT: store i32 9, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK1-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK1-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK1-NEXT: [[TMP1:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4
+// CHECK1-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]])
+// CHECK1-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP3]], 9
+// CHECK1-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK1: cond.true:
+// CHECK1-NEXT: br label [[COND_END:%.*]]
+// CHECK1: cond.false:
+// CHECK1-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: br label [[COND_END]]
+// CHECK1: cond.end:
+// CHECK1-NEXT: [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP4]], [[COND_FALSE]] ]
+// CHECK1-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK1-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK1: omp.inner.for.cond:
+// CHECK1-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP6]], 10
+// CHECK1-NEXT: br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK1: omp.inner.for.body:
+// CHECK1-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK1-NEXT: [[TMP8:%.*]] = zext i32 [[TMP7]] to i64
+// CHECK1-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: [[TMP10:%.*]] = zext i32 [[TMP9]] to i64
+// CHECK1-NEXT: [[TMP11:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0
+// CHECK1-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP8]] to i8*
+// CHECK1-NEXT: store i8* [[TMP12]], i8** [[TMP11]], align 8
+// CHECK1-NEXT: [[TMP13:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1
+// CHECK1-NEXT: [[TMP14:%.*]] = inttoptr i64 [[TMP10]] to i8*
+// CHECK1-NEXT: store i8* [[TMP14]], i8** [[TMP13]], align 8
+// CHECK1-NEXT: [[TMP15:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 2
+// CHECK1-NEXT: [[TMP16:%.*]] = bitcast [10 x i32]* [[TMP0]] to i8*
+// CHECK1-NEXT: store i8* [[TMP16]], i8** [[TMP15]], align 8
+// CHECK1-NEXT: [[TMP17:%.*]] = bitcast [3 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK1-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i64, i64, [10 x i32]*)* @__omp_outlined__5 to i8*), i8* null, i8** [[TMP17]], i64 3)
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK1: omp.inner.for.inc:
+// CHECK1-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP18]], [[TMP19]]
+// CHECK1-NEXT: store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK1-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK1-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP20]], [[TMP21]]
+// CHECK1-NEXT: store i32 [[ADD2]], i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK1-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK1-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP22]], [[TMP23]]
+// CHECK1-NEXT: store i32 [[ADD3]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP24]], 9
+// CHECK1-NEXT: br i1 [[CMP4]], label [[COND_TRUE5:%.*]], label [[COND_FALSE6:%.*]]
+// CHECK1: cond.true5:
+// CHECK1-NEXT: br label [[COND_END7:%.*]]
+// CHECK1: cond.false6:
+// CHECK1-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: br label [[COND_END7]]
+// CHECK1: cond.end7:
+// CHECK1-NEXT: [[COND8:%.*]] = phi i32 [ 9, [[COND_TRUE5]] ], [ [[TMP25]], [[COND_FALSE6]] ]
+// CHECK1-NEXT: store i32 [[COND8]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: [[TMP26:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK1-NEXT: store i32 [[TMP26]], i32* [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK1: omp.inner.for.end:
+// CHECK1-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK1: omp.loop.exit:
+// CHECK1-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]])
+// CHECK1-NEXT: ret void
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__5
+// CHECK1-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK1-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 8
+// CHECK1-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
+// CHECK1-NEXT: store i64 [[DOTPREVIOUS_LB_]], i64* [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK1-NEXT: store i64 [[DOTPREVIOUS_UB_]], i64* [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK1-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 8
+// CHECK1-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 8
+// CHECK1-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK1-NEXT: store i32 9, i32* [[DOTOMP_UB]], align 4
+// CHECK1-NEXT: [[TMP1:%.*]] = load i64, i64* [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK1-NEXT: [[CONV:%.*]] = trunc i64 [[TMP1]] to i32
+// CHECK1-NEXT: [[TMP2:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK1-NEXT: [[CONV1:%.*]] = trunc i64 [[TMP2]] to i32
+// CHECK1-NEXT: store i32 [[CONV]], i32* [[DOTOMP_LB]], align 4
+// CHECK1-NEXT: store i32 [[CONV1]], i32* [[DOTOMP_UB]], align 4
+// CHECK1-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK1-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK1-NEXT: [[TMP3:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4
+// CHECK1-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP4]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK1-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK1-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK1: omp.inner.for.cond:
+// CHECK1-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: [[CONV2:%.*]] = sext i32 [[TMP6]] to i64
+// CHECK1-NEXT: [[TMP7:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK1-NEXT: [[CMP:%.*]] = icmp ule i64 [[CONV2]], [[TMP7]]
+// CHECK1-NEXT: br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK1: omp.inner.for.body:
+// CHECK1-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP8]], 1
+// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK1-NEXT: store i32 [[ADD]], i32* [[I]], align 4
+// CHECK1-NEXT: [[TMP9:%.*]] = load i32, i32* [[I]], align 4
+// CHECK1-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP9]] to i64
+// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP0]], i64 0, i64 [[IDXPROM]]
+// CHECK1-NEXT: [[TMP10:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+// CHECK1-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP10]], 1
+// CHECK1-NEXT: store i32 [[ADD3]], i32* [[ARRAYIDX]], align 4
+// CHECK1-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK1: omp.body.continue:
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK1: omp.inner.for.inc:
+// CHECK1-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK1-NEXT: [[ADD4:%.*]] = add nsw i32 [[TMP11]], [[TMP12]]
+// CHECK1-NEXT: store i32 [[ADD4]], i32* [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK1: omp.inner.for.end:
+// CHECK1-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK1: omp.loop.exit:
+// CHECK1-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP4]])
+// CHECK1-NEXT: ret void
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l59
+// CHECK1-SAME: ([10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i64 [[F:%.*]]) #[[ATTR0]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 8
+// CHECK1-NEXT: [[F_ADDR:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[F_CASTED:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK1-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 8
+// CHECK1-NEXT: store i64 [[F]], i64* [[F_ADDR]], align 8
+// CHECK1-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 8
+// CHECK1-NEXT: [[CONV:%.*]] = bitcast i64* [[F_ADDR]] to i32*
+// CHECK1-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK1-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0)
+// CHECK1-NEXT: br label [[DOTEXECUTE:%.*]]
+// CHECK1: .execute:
+// CHECK1-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]])
+// CHECK1-NEXT: [[TMP2:%.*]] = load i32, i32* [[CONV]], align 8
+// CHECK1-NEXT: [[CONV1:%.*]] = bitcast i64* [[F_CASTED]] to i32*
+// CHECK1-NEXT: store i32 [[TMP2]], i32* [[CONV1]], align 4
+// CHECK1-NEXT: [[TMP3:%.*]] = load i64, i64* [[F_CASTED]], align 8
+// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK1-NEXT: call void @__omp_outlined__6(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x [10 x i32]]* [[TMP0]], i64 [[TMP3]]) #[[ATTR2]]
+// CHECK1-NEXT: br label [[DOTOMP_DEINIT:%.*]]
+// CHECK1: .omp.deinit:
+// CHECK1-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0)
+// CHECK1-NEXT: br label [[DOTEXIT:%.*]]
+// CHECK1: .exit:
+// CHECK1-NEXT: ret void
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__6
+// CHECK1-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i64 [[F:%.*]]) #[[ATTR0]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK1-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 8
+// CHECK1-NEXT: [[F_ADDR:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[_TMP1:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[K:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[J:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[F_CASTED:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x i8*], align 8
+// CHECK1-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
+// CHECK1-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 8
+// CHECK1-NEXT: store i64 [[F]], i64* [[F_ADDR]], align 8
+// CHECK1-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 8
+// CHECK1-NEXT: [[CONV:%.*]] = bitcast i64* [[F_ADDR]] to i32*
+// CHECK1-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK1-NEXT: store i32 99, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK1-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK1-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK1-NEXT: [[TMP1:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4
+// CHECK1-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]])
+// CHECK1-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP3]], 99
+// CHECK1-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK1: cond.true:
+// CHECK1-NEXT: br label [[COND_END:%.*]]
+// CHECK1: cond.false:
+// CHECK1-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: br label [[COND_END]]
+// CHECK1: cond.end:
+// CHECK1-NEXT: [[COND:%.*]] = phi i32 [ 99, [[COND_TRUE]] ], [ [[TMP4]], [[COND_FALSE]] ]
+// CHECK1-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK1-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK1: omp.inner.for.cond:
+// CHECK1-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: [[CMP2:%.*]] = icmp slt i32 [[TMP6]], 100
+// CHECK1-NEXT: br i1 [[CMP2]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK1: omp.inner.for.body:
+// CHECK1-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK1-NEXT: [[TMP8:%.*]] = zext i32 [[TMP7]] to i64
+// CHECK1-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: [[TMP10:%.*]] = zext i32 [[TMP9]] to i64
+// CHECK1-NEXT: [[TMP11:%.*]] = load i32, i32* [[CONV]], align 8
+// CHECK1-NEXT: [[CONV3:%.*]] = bitcast i64* [[F_CASTED]] to i32*
+// CHECK1-NEXT: store i32 [[TMP11]], i32* [[CONV3]], align 4
+// CHECK1-NEXT: [[TMP12:%.*]] = load i64, i64* [[F_CASTED]], align 8
+// CHECK1-NEXT: [[TMP13:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0
+// CHECK1-NEXT: [[TMP14:%.*]] = inttoptr i64 [[TMP8]] to i8*
+// CHECK1-NEXT: store i8* [[TMP14]], i8** [[TMP13]], align 8
+// CHECK1-NEXT: [[TMP15:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1
+// CHECK1-NEXT: [[TMP16:%.*]] = inttoptr i64 [[TMP10]] to i8*
+// CHECK1-NEXT: store i8* [[TMP16]], i8** [[TMP15]], align 8
+// CHECK1-NEXT: [[TMP17:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 2
+// CHECK1-NEXT: [[TMP18:%.*]] = bitcast [10 x [10 x i32]]* [[TMP0]] to i8*
+// CHECK1-NEXT: store i8* [[TMP18]], i8** [[TMP17]], align 8
+// CHECK1-NEXT: [[TMP19:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 3
+// CHECK1-NEXT: [[TMP20:%.*]] = inttoptr i64 [[TMP12]] to i8*
+// CHECK1-NEXT: store i8* [[TMP20]], i8** [[TMP19]], align 8
+// CHECK1-NEXT: [[TMP21:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK1-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i64, i64, [10 x [10 x i32]]*, i64)* @__omp_outlined__7 to i8*), i8* null, i8** [[TMP21]], i64 4)
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK1: omp.inner.for.inc:
+// CHECK1-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP22]], [[TMP23]]
+// CHECK1-NEXT: store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK1-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK1-NEXT: [[ADD4:%.*]] = add nsw i32 [[TMP24]], [[TMP25]]
+// CHECK1-NEXT: store i32 [[ADD4]], i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK1-NEXT: [[TMP26:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: [[TMP27:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK1-NEXT: [[ADD5:%.*]] = add nsw i32 [[TMP26]], [[TMP27]]
+// CHECK1-NEXT: store i32 [[ADD5]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: [[TMP28:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[TMP28]], 99
+// CHECK1-NEXT: br i1 [[CMP6]], label [[COND_TRUE7:%.*]], label [[COND_FALSE8:%.*]]
+// CHECK1: cond.true7:
+// CHECK1-NEXT: br label [[COND_END9:%.*]]
+// CHECK1: cond.false8:
+// CHECK1-NEXT: [[TMP29:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: br label [[COND_END9]]
+// CHECK1: cond.end9:
+// CHECK1-NEXT: [[COND10:%.*]] = phi i32 [ 99, [[COND_TRUE7]] ], [ [[TMP29]], [[COND_FALSE8]] ]
+// CHECK1-NEXT: store i32 [[COND10]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: [[TMP30:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK1-NEXT: store i32 [[TMP30]], i32* [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK1: omp.inner.for.end:
+// CHECK1-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK1: omp.loop.exit:
+// CHECK1-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]])
+// CHECK1-NEXT: ret void
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__7
+// CHECK1-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i64 [[F:%.*]]) #[[ATTR0]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK1-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 8
+// CHECK1-NEXT: [[F_ADDR:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[_TMP1:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[K:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[J:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
+// CHECK1-NEXT: store i64 [[DOTPREVIOUS_LB_]], i64* [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK1-NEXT: store i64 [[DOTPREVIOUS_UB_]], i64* [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK1-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 8
+// CHECK1-NEXT: store i64 [[F]], i64* [[F_ADDR]], align 8
+// CHECK1-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 8
+// CHECK1-NEXT: [[CONV:%.*]] = bitcast i64* [[F_ADDR]] to i32*
+// CHECK1-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK1-NEXT: store i32 99, i32* [[DOTOMP_UB]], align 4
+// CHECK1-NEXT: [[TMP1:%.*]] = load i64, i64* [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK1-NEXT: [[CONV2:%.*]] = trunc i64 [[TMP1]] to i32
+// CHECK1-NEXT: [[TMP2:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK1-NEXT: [[CONV3:%.*]] = trunc i64 [[TMP2]] to i32
+// CHECK1-NEXT: store i32 [[CONV2]], i32* [[DOTOMP_LB]], align 4
+// CHECK1-NEXT: store i32 [[CONV3]], i32* [[DOTOMP_UB]], align 4
+// CHECK1-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK1-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK1-NEXT: [[TMP3:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4
+// CHECK1-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP4]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK1-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK1-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK1: omp.inner.for.cond:
+// CHECK1-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: [[CONV4:%.*]] = sext i32 [[TMP6]] to i64
+// CHECK1-NEXT: [[TMP7:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK1-NEXT: [[CMP:%.*]] = icmp ule i64 [[CONV4]], [[TMP7]]
+// CHECK1-NEXT: br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK1: omp.inner.for.body:
+// CHECK1-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: [[DIV:%.*]] = sdiv i32 [[TMP8]], 10
+// CHECK1-NEXT: [[MUL:%.*]] = mul nsw i32 [[DIV]], 1
+// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK1-NEXT: store i32 [[ADD]], i32* [[I]], align 4
+// CHECK1-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: [[DIV5:%.*]] = sdiv i32 [[TMP10]], 10
+// CHECK1-NEXT: [[MUL6:%.*]] = mul nsw i32 [[DIV5]], 10
+// CHECK1-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP9]], [[MUL6]]
+// CHECK1-NEXT: [[MUL7:%.*]] = mul nsw i32 [[SUB]], 1
+// CHECK1-NEXT: [[ADD8:%.*]] = add nsw i32 0, [[MUL7]]
+// CHECK1-NEXT: store i32 [[ADD8]], i32* [[J]], align 4
+// CHECK1-NEXT: store i32 10, i32* [[K]], align 4
+// CHECK1-NEXT: [[TMP11:%.*]] = load i32, i32* [[I]], align 4
+// CHECK1-NEXT: [[TMP12:%.*]] = load i32, i32* [[J]], align 4
+// CHECK1-NEXT: [[TMP13:%.*]] = load i32, i32* [[CONV]], align 8
+// CHECK1-NEXT: [[MUL9:%.*]] = mul nsw i32 [[TMP12]], [[TMP13]]
+// CHECK1-NEXT: [[ADD10:%.*]] = add nsw i32 [[TMP11]], [[MUL9]]
+// CHECK1-NEXT: [[TMP14:%.*]] = load i32, i32* [[K]], align 4
+// CHECK1-NEXT: [[ADD11:%.*]] = add nsw i32 [[ADD10]], [[TMP14]]
+// CHECK1-NEXT: [[TMP15:%.*]] = load i32, i32* [[I]], align 4
+// CHECK1-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP15]] to i64
+// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* [[TMP0]], i64 0, i64 [[IDXPROM]]
+// CHECK1-NEXT: [[TMP16:%.*]] = load i32, i32* [[J]], align 4
+// CHECK1-NEXT: [[IDXPROM12:%.*]] = sext i32 [[TMP16]] to i64
+// CHECK1-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[ARRAYIDX]], i64 0, i64 [[IDXPROM12]]
+// CHECK1-NEXT: store i32 [[ADD11]], i32* [[ARRAYIDX13]], align 4
+// CHECK1-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK1: omp.body.continue:
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK1: omp.inner.for.inc:
+// CHECK1-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK1-NEXT: [[ADD14:%.*]] = add nsw i32 [[TMP17]], [[TMP18]]
+// CHECK1-NEXT: store i32 [[ADD14]], i32* [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK1: omp.inner.for.end:
+// CHECK1-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK1: omp.loop.exit:
+// CHECK1-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP4]])
+// CHECK1-NEXT: ret void
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l67
+// CHECK1-SAME: (i64 [[N:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]]) #[[ATTR0]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 8
+// CHECK1-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK1-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8
+// CHECK1-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 8
+// CHECK1-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32*
+// CHECK1-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 8
+// CHECK1-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK1-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0)
+// CHECK1-NEXT: br label [[DOTEXECUTE:%.*]]
+// CHECK1: .execute:
+// CHECK1-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]])
+// CHECK1-NEXT: [[TMP2:%.*]] = load i32, i32* [[CONV]], align 8
+// CHECK1-NEXT: [[CONV1:%.*]] = bitcast i64* [[N_CASTED]] to i32*
+// CHECK1-NEXT: store i32 [[TMP2]], i32* [[CONV1]], align 4
+// CHECK1-NEXT: [[TMP3:%.*]] = load i64, i64* [[N_CASTED]], align 8
+// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK1-NEXT: call void @__omp_outlined__8(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i64 [[TMP3]], [10 x [10 x i32]]* [[TMP0]]) #[[ATTR2]]
+// CHECK1-NEXT: br label [[DOTOMP_DEINIT:%.*]]
+// CHECK1: .omp.deinit:
+// CHECK1-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0)
+// CHECK1-NEXT: br label [[DOTEXIT:%.*]]
+// CHECK1: .exit:
+// CHECK1-NEXT: ret void
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__8
+// CHECK1-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[N:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]]) #[[ATTR0]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK1-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 8
+// CHECK1-NEXT: [[DOTOMP_IV:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[_TMP1:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTCAPTURE_EXPR_3:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[J:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[I10:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[J11:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x i8*], align 8
+// CHECK1-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
+// CHECK1-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8
+// CHECK1-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 8
+// CHECK1-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32*
+// CHECK1-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 8
+// CHECK1-NEXT: [[TMP1:%.*]] = load i32, i32* [[CONV]], align 8
+// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK1-NEXT: [[TMP2:%.*]] = load i32, i32* [[CONV]], align 8
+// CHECK1-NEXT: store i32 [[TMP2]], i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK1-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK1-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP3]], 0
+// CHECK1-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK1-NEXT: [[CONV4:%.*]] = sext i32 [[DIV]] to i64
+// CHECK1-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK1-NEXT: [[SUB5:%.*]] = sub nsw i32 [[TMP4]], 0
+// CHECK1-NEXT: [[DIV6:%.*]] = sdiv i32 [[SUB5]], 1
+// CHECK1-NEXT: [[CONV7:%.*]] = sext i32 [[DIV6]] to i64
+// CHECK1-NEXT: [[MUL:%.*]] = mul nsw i64 [[CONV4]], [[CONV7]]
+// CHECK1-NEXT: [[SUB8:%.*]] = sub nsw i64 [[MUL]], 1
+// CHECK1-NEXT: store i64 [[SUB8]], i64* [[DOTCAPTURE_EXPR_3]], align 8
+// CHECK1-NEXT: store i32 0, i32* [[I]], align 4
+// CHECK1-NEXT: store i32 0, i32* [[J]], align 4
+// CHECK1-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK1-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP5]]
+// CHECK1-NEXT: br i1 [[CMP]], label [[LAND_LHS_TRUE:%.*]], label [[OMP_PRECOND_END:%.*]]
+// CHECK1: land.lhs.true:
+// CHECK1-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK1-NEXT: [[CMP9:%.*]] = icmp slt i32 0, [[TMP6]]
+// CHECK1-NEXT: br i1 [[CMP9]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END]]
+// CHECK1: omp.precond.then:
+// CHECK1-NEXT: store i64 0, i64* [[DOTOMP_COMB_LB]], align 8
+// CHECK1-NEXT: [[TMP7:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8
+// CHECK1-NEXT: store i64 [[TMP7]], i64* [[DOTOMP_COMB_UB]], align 8
+// CHECK1-NEXT: store i64 1, i64* [[DOTOMP_STRIDE]], align 8
+// CHECK1-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK1-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK1-NEXT: [[CONV12:%.*]] = zext i32 [[NVPTX_NUM_THREADS]] to i64
+// CHECK1-NEXT: [[TMP8:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP8]], align 4
+// CHECK1-NEXT: call void @__kmpc_for_static_init_8(%struct.ident_t* @[[GLOB1]], i32 [[TMP9]], i32 91, i32* [[DOTOMP_IS_LAST]], i64* [[DOTOMP_COMB_LB]], i64* [[DOTOMP_COMB_UB]], i64* [[DOTOMP_STRIDE]], i64 1, i64 [[CONV12]])
+// CHECK1-NEXT: [[TMP10:%.*]] = load i64, i64* [[DOTOMP_COMB_UB]], align 8
+// CHECK1-NEXT: [[TMP11:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8
+// CHECK1-NEXT: [[CMP13:%.*]] = icmp sgt i64 [[TMP10]], [[TMP11]]
+// CHECK1-NEXT: br i1 [[CMP13]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK1: cond.true:
+// CHECK1-NEXT: [[TMP12:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8
+// CHECK1-NEXT: br label [[COND_END:%.*]]
+// CHECK1: cond.false:
+// CHECK1-NEXT: [[TMP13:%.*]] = load i64, i64* [[DOTOMP_COMB_UB]], align 8
+// CHECK1-NEXT: br label [[COND_END]]
+// CHECK1: cond.end:
+// CHECK1-NEXT: [[COND:%.*]] = phi i64 [ [[TMP12]], [[COND_TRUE]] ], [ [[TMP13]], [[COND_FALSE]] ]
+// CHECK1-NEXT: store i64 [[COND]], i64* [[DOTOMP_COMB_UB]], align 8
+// CHECK1-NEXT: [[TMP14:%.*]] = load i64, i64* [[DOTOMP_COMB_LB]], align 8
+// CHECK1-NEXT: store i64 [[TMP14]], i64* [[DOTOMP_IV]], align 8
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK1: omp.inner.for.cond:
+// CHECK1-NEXT: [[TMP15:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8
+// CHECK1-NEXT: [[TMP16:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8
+// CHECK1-NEXT: [[ADD:%.*]] = add nsw i64 [[TMP16]], 1
+// CHECK1-NEXT: [[CMP14:%.*]] = icmp slt i64 [[TMP15]], [[ADD]]
+// CHECK1-NEXT: br i1 [[CMP14]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK1: omp.inner.for.body:
+// CHECK1-NEXT: [[TMP17:%.*]] = load i64, i64* [[DOTOMP_COMB_LB]], align 8
+// CHECK1-NEXT: [[TMP18:%.*]] = load i64, i64* [[DOTOMP_COMB_UB]], align 8
+// CHECK1-NEXT: [[TMP19:%.*]] = load i32, i32* [[CONV]], align 8
+// CHECK1-NEXT: [[CONV15:%.*]] = bitcast i64* [[N_CASTED]] to i32*
+// CHECK1-NEXT: store i32 [[TMP19]], i32* [[CONV15]], align 4
+// CHECK1-NEXT: [[TMP20:%.*]] = load i64, i64* [[N_CASTED]], align 8
+// CHECK1-NEXT: [[TMP21:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0
+// CHECK1-NEXT: [[TMP22:%.*]] = inttoptr i64 [[TMP17]] to i8*
+// CHECK1-NEXT: store i8* [[TMP22]], i8** [[TMP21]], align 8
+// CHECK1-NEXT: [[TMP23:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1
+// CHECK1-NEXT: [[TMP24:%.*]] = inttoptr i64 [[TMP18]] to i8*
+// CHECK1-NEXT: store i8* [[TMP24]], i8** [[TMP23]], align 8
+// CHECK1-NEXT: [[TMP25:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 2
+// CHECK1-NEXT: [[TMP26:%.*]] = inttoptr i64 [[TMP20]] to i8*
+// CHECK1-NEXT: store i8* [[TMP26]], i8** [[TMP25]], align 8
+// CHECK1-NEXT: [[TMP27:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 3
+// CHECK1-NEXT: [[TMP28:%.*]] = bitcast [10 x [10 x i32]]* [[TMP0]] to i8*
+// CHECK1-NEXT: store i8* [[TMP28]], i8** [[TMP27]], align 8
+// CHECK1-NEXT: [[TMP29:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT: [[TMP30:%.*]] = load i32, i32* [[TMP29]], align 4
+// CHECK1-NEXT: [[TMP31:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK1-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP30]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i64, i64, i64, [10 x [10 x i32]]*)* @__omp_outlined__9 to i8*), i8* null, i8** [[TMP31]], i64 4)
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK1: omp.inner.for.inc:
+// CHECK1-NEXT: [[TMP32:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8
+// CHECK1-NEXT: [[TMP33:%.*]] = load i64, i64* [[DOTOMP_STRIDE]], align 8
+// CHECK1-NEXT: [[ADD16:%.*]] = add nsw i64 [[TMP32]], [[TMP33]]
+// CHECK1-NEXT: store i64 [[ADD16]], i64* [[DOTOMP_IV]], align 8
+// CHECK1-NEXT: [[TMP34:%.*]] = load i64, i64* [[DOTOMP_COMB_LB]], align 8
+// CHECK1-NEXT: [[TMP35:%.*]] = load i64, i64* [[DOTOMP_STRIDE]], align 8
+// CHECK1-NEXT: [[ADD17:%.*]] = add nsw i64 [[TMP34]], [[TMP35]]
+// CHECK1-NEXT: store i64 [[ADD17]], i64* [[DOTOMP_COMB_LB]], align 8
+// CHECK1-NEXT: [[TMP36:%.*]] = load i64, i64* [[DOTOMP_COMB_UB]], align 8
+// CHECK1-NEXT: [[TMP37:%.*]] = load i64, i64* [[DOTOMP_STRIDE]], align 8
+// CHECK1-NEXT: [[ADD18:%.*]] = add nsw i64 [[TMP36]], [[TMP37]]
+// CHECK1-NEXT: store i64 [[ADD18]], i64* [[DOTOMP_COMB_UB]], align 8
+// CHECK1-NEXT: [[TMP38:%.*]] = load i64, i64* [[DOTOMP_COMB_UB]], align 8
+// CHECK1-NEXT: [[TMP39:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8
+// CHECK1-NEXT: [[CMP19:%.*]] = icmp sgt i64 [[TMP38]], [[TMP39]]
+// CHECK1-NEXT: br i1 [[CMP19]], label [[COND_TRUE20:%.*]], label [[COND_FALSE21:%.*]]
+// CHECK1: cond.true20:
+// CHECK1-NEXT: [[TMP40:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8
+// CHECK1-NEXT: br label [[COND_END22:%.*]]
+// CHECK1: cond.false21:
+// CHECK1-NEXT: [[TMP41:%.*]] = load i64, i64* [[DOTOMP_COMB_UB]], align 8
+// CHECK1-NEXT: br label [[COND_END22]]
+// CHECK1: cond.end22:
+// CHECK1-NEXT: [[COND23:%.*]] = phi i64 [ [[TMP40]], [[COND_TRUE20]] ], [ [[TMP41]], [[COND_FALSE21]] ]
+// CHECK1-NEXT: store i64 [[COND23]], i64* [[DOTOMP_COMB_UB]], align 8
+// CHECK1-NEXT: [[TMP42:%.*]] = load i64, i64* [[DOTOMP_COMB_LB]], align 8
+// CHECK1-NEXT: store i64 [[TMP42]], i64* [[DOTOMP_IV]], align 8
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK1: omp.inner.for.end:
+// CHECK1-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK1: omp.loop.exit:
+// CHECK1-NEXT: [[TMP43:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT: [[TMP44:%.*]] = load i32, i32* [[TMP43]], align 4
+// CHECK1-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP44]])
+// CHECK1-NEXT: br label [[OMP_PRECOND_END]]
+// CHECK1: omp.precond.end:
+// CHECK1-NEXT: ret void
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__9
+// CHECK1-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], i64 [[N:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]]) #[[ATTR0]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK1-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 8
+// CHECK1-NEXT: [[DOTOMP_IV:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[_TMP1:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTCAPTURE_EXPR_3:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[J:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_LB:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[DOTOMP_UB:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[I10:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[J11:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
+// CHECK1-NEXT: store i64 [[DOTPREVIOUS_LB_]], i64* [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK1-NEXT: store i64 [[DOTPREVIOUS_UB_]], i64* [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK1-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8
+// CHECK1-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 8
+// CHECK1-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32*
+// CHECK1-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 8
+// CHECK1-NEXT: [[TMP1:%.*]] = load i32, i32* [[CONV]], align 8
+// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK1-NEXT: [[TMP2:%.*]] = load i32, i32* [[CONV]], align 8
+// CHECK1-NEXT: store i32 [[TMP2]], i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK1-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK1-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP3]], 0
+// CHECK1-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK1-NEXT: [[CONV4:%.*]] = sext i32 [[DIV]] to i64
+// CHECK1-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK1-NEXT: [[SUB5:%.*]] = sub nsw i32 [[TMP4]], 0
+// CHECK1-NEXT: [[DIV6:%.*]] = sdiv i32 [[SUB5]], 1
+// CHECK1-NEXT: [[CONV7:%.*]] = sext i32 [[DIV6]] to i64
+// CHECK1-NEXT: [[MUL:%.*]] = mul nsw i64 [[CONV4]], [[CONV7]]
+// CHECK1-NEXT: [[SUB8:%.*]] = sub nsw i64 [[MUL]], 1
+// CHECK1-NEXT: store i64 [[SUB8]], i64* [[DOTCAPTURE_EXPR_3]], align 8
+// CHECK1-NEXT: store i32 0, i32* [[I]], align 4
+// CHECK1-NEXT: store i32 0, i32* [[J]], align 4
+// CHECK1-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK1-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP5]]
+// CHECK1-NEXT: br i1 [[CMP]], label [[LAND_LHS_TRUE:%.*]], label [[OMP_PRECOND_END:%.*]]
+// CHECK1: land.lhs.true:
+// CHECK1-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK1-NEXT: [[CMP9:%.*]] = icmp slt i32 0, [[TMP6]]
+// CHECK1-NEXT: br i1 [[CMP9]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END]]
+// CHECK1: omp.precond.then:
+// CHECK1-NEXT: store i64 0, i64* [[DOTOMP_LB]], align 8
+// CHECK1-NEXT: [[TMP7:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8
+// CHECK1-NEXT: store i64 [[TMP7]], i64* [[DOTOMP_UB]], align 8
+// CHECK1-NEXT: [[TMP8:%.*]] = load i64, i64* [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK1-NEXT: [[TMP9:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK1-NEXT: store i64 [[TMP8]], i64* [[DOTOMP_LB]], align 8
+// CHECK1-NEXT: store i64 [[TMP9]], i64* [[DOTOMP_UB]], align 8
+// CHECK1-NEXT: store i64 1, i64* [[DOTOMP_STRIDE]], align 8
+// CHECK1-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK1-NEXT: [[TMP10:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT: [[TMP11:%.*]] = load i32, i32* [[TMP10]], align 4
+// CHECK1-NEXT: call void @__kmpc_for_static_init_8(%struct.ident_t* @[[GLOB2]], i32 [[TMP11]], i32 33, i32* [[DOTOMP_IS_LAST]], i64* [[DOTOMP_LB]], i64* [[DOTOMP_UB]], i64* [[DOTOMP_STRIDE]], i64 1, i64 1)
+// CHECK1-NEXT: [[TMP12:%.*]] = load i64, i64* [[DOTOMP_LB]], align 8
+// CHECK1-NEXT: store i64 [[TMP12]], i64* [[DOTOMP_IV]], align 8
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK1: omp.inner.for.cond:
+// CHECK1-NEXT: [[TMP13:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8
+// CHECK1-NEXT: [[TMP14:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK1-NEXT: [[CMP12:%.*]] = icmp ule i64 [[TMP13]], [[TMP14]]
+// CHECK1-NEXT: br i1 [[CMP12]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK1: omp.inner.for.body:
+// CHECK1-NEXT: [[TMP15:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8
+// CHECK1-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK1-NEXT: [[SUB13:%.*]] = sub nsw i32 [[TMP16]], 0
+// CHECK1-NEXT: [[DIV14:%.*]] = sdiv i32 [[SUB13]], 1
+// CHECK1-NEXT: [[MUL15:%.*]] = mul nsw i32 1, [[DIV14]]
+// CHECK1-NEXT: [[CONV16:%.*]] = sext i32 [[MUL15]] to i64
+// CHECK1-NEXT: [[DIV17:%.*]] = sdiv i64 [[TMP15]], [[CONV16]]
+// CHECK1-NEXT: [[MUL18:%.*]] = mul nsw i64 [[DIV17]], 1
+// CHECK1-NEXT: [[ADD:%.*]] = add nsw i64 0, [[MUL18]]
+// CHECK1-NEXT: [[CONV19:%.*]] = trunc i64 [[ADD]] to i32
+// CHECK1-NEXT: store i32 [[CONV19]], i32* [[I10]], align 4
+// CHECK1-NEXT: [[TMP17:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8
+// CHECK1-NEXT: [[TMP18:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8
+// CHECK1-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK1-NEXT: [[SUB20:%.*]] = sub nsw i32 [[TMP19]], 0
+// CHECK1-NEXT: [[DIV21:%.*]] = sdiv i32 [[SUB20]], 1
+// CHECK1-NEXT: [[MUL22:%.*]] = mul nsw i32 1, [[DIV21]]
+// CHECK1-NEXT: [[CONV23:%.*]] = sext i32 [[MUL22]] to i64
+// CHECK1-NEXT: [[DIV24:%.*]] = sdiv i64 [[TMP18]], [[CONV23]]
+// CHECK1-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK1-NEXT: [[SUB25:%.*]] = sub nsw i32 [[TMP20]], 0
+// CHECK1-NEXT: [[DIV26:%.*]] = sdiv i32 [[SUB25]], 1
+// CHECK1-NEXT: [[MUL27:%.*]] = mul nsw i32 1, [[DIV26]]
+// CHECK1-NEXT: [[CONV28:%.*]] = sext i32 [[MUL27]] to i64
+// CHECK1-NEXT: [[MUL29:%.*]] = mul nsw i64 [[DIV24]], [[CONV28]]
+// CHECK1-NEXT: [[SUB30:%.*]] = sub nsw i64 [[TMP17]], [[MUL29]]
+// CHECK1-NEXT: [[MUL31:%.*]] = mul nsw i64 [[SUB30]], 1
+// CHECK1-NEXT: [[ADD32:%.*]] = add nsw i64 0, [[MUL31]]
+// CHECK1-NEXT: [[CONV33:%.*]] = trunc i64 [[ADD32]] to i32
+// CHECK1-NEXT: store i32 [[CONV33]], i32* [[J11]], align 4
+// CHECK1-NEXT: [[TMP21:%.*]] = load i32, i32* [[I10]], align 4
+// CHECK1-NEXT: [[TMP22:%.*]] = load i32, i32* [[J11]], align 4
+// CHECK1-NEXT: [[ADD34:%.*]] = add nsw i32 [[TMP21]], [[TMP22]]
+// CHECK1-NEXT: [[TMP23:%.*]] = load i32, i32* [[I10]], align 4
+// CHECK1-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP23]] to i64
+// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* [[TMP0]], i64 0, i64 [[IDXPROM]]
+// CHECK1-NEXT: [[TMP24:%.*]] = load i32, i32* [[J11]], align 4
+// CHECK1-NEXT: [[IDXPROM35:%.*]] = sext i32 [[TMP24]] to i64
+// CHECK1-NEXT: [[ARRAYIDX36:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[ARRAYIDX]], i64 0, i64 [[IDXPROM35]]
+// CHECK1-NEXT: store i32 [[ADD34]], i32* [[ARRAYIDX36]], align 4
+// CHECK1-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK1: omp.body.continue:
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK1: omp.inner.for.inc:
+// CHECK1-NEXT: [[TMP25:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8
+// CHECK1-NEXT: [[TMP26:%.*]] = load i64, i64* [[DOTOMP_STRIDE]], align 8
+// CHECK1-NEXT: [[ADD37:%.*]] = add nsw i64 [[TMP25]], [[TMP26]]
+// CHECK1-NEXT: store i64 [[ADD37]], i64* [[DOTOMP_IV]], align 8
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK1: omp.inner.for.end:
+// CHECK1-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK1: omp.loop.exit:
+// CHECK1-NEXT: [[TMP27:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT: [[TMP28:%.*]] = load i32, i32* [[TMP27]], align 4
+// CHECK1-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP28]])
+// CHECK1-NEXT: br label [[OMP_PRECOND_END]]
+// CHECK1: omp.precond.end:
+// CHECK1-NEXT: ret void
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l74
+// CHECK1-SAME: (i64 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32* [[V:%.*]]) #[[ATTR0]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 8
+// CHECK1-NEXT: [[V_ADDR:%.*]] = alloca i32*, align 8
+// CHECK1-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK1-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8
+// CHECK1-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 8
+// CHECK1-NEXT: store i32* [[V]], i32** [[V_ADDR]], align 8
+// CHECK1-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32*
+// CHECK1-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 8
+// CHECK1-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK1-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0)
+// CHECK1-NEXT: br label [[DOTEXECUTE:%.*]]
+// CHECK1: .execute:
+// CHECK1-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]])
+// CHECK1-NEXT: [[TMP2:%.*]] = load i32, i32* [[CONV]], align 8
+// CHECK1-NEXT: [[CONV1:%.*]] = bitcast i64* [[N_CASTED]] to i32*
+// CHECK1-NEXT: store i32 [[TMP2]], i32* [[CONV1]], align 4
+// CHECK1-NEXT: [[TMP3:%.*]] = load i64, i64* [[N_CASTED]], align 8
+// CHECK1-NEXT: [[TMP4:%.*]] = load i32*, i32** [[V_ADDR]], align 8
+// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK1-NEXT: call void @__omp_outlined__10(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i64 [[TMP3]], [1000 x i32]* [[TMP0]], i32* [[TMP4]]) #[[ATTR2]]
+// CHECK1-NEXT: br label [[DOTOMP_DEINIT:%.*]]
+// CHECK1: .omp.deinit:
+// CHECK1-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0)
+// CHECK1-NEXT: br label [[DOTEXIT:%.*]]
+// CHECK1: .exit:
+// CHECK1-NEXT: ret void
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__10
+// CHECK1-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32* [[V:%.*]]) #[[ATTR0]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK1-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 8
+// CHECK1-NEXT: [[V_ADDR:%.*]] = alloca i32*, align 8
+// CHECK1-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[I3:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [5 x i8*], align 8
+// CHECK1-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
+// CHECK1-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8
+// CHECK1-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 8
+// CHECK1-NEXT: store i32* [[V]], i32** [[V_ADDR]], align 8
+// CHECK1-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32*
+// CHECK1-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 8
+// CHECK1-NEXT: [[TMP1:%.*]] = load i32, i32* [[CONV]], align 8
+// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK1-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK1-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0
+// CHECK1-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK1-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1
+// CHECK1-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK1-NEXT: store i32 0, i32* [[I]], align 4
+// CHECK1-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK1-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]]
+// CHECK1-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]]
+// CHECK1: omp.precond.then:
+// CHECK1-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK1-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK1-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK1-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK1-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK1-NEXT: [[TMP5:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4
+// CHECK1-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP6]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]])
+// CHECK1-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK1-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP7]], [[TMP8]]
+// CHECK1-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK1: cond.true:
+// CHECK1-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK1-NEXT: br label [[COND_END:%.*]]
+// CHECK1: cond.false:
+// CHECK1-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: br label [[COND_END]]
+// CHECK1: cond.end:
+// CHECK1-NEXT: [[COND:%.*]] = phi i32 [ [[TMP9]], [[COND_TRUE]] ], [ [[TMP10]], [[COND_FALSE]] ]
+// CHECK1-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK1-NEXT: store i32 [[TMP11]], i32* [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK1: omp.inner.for.cond:
+// CHECK1-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP13]], 1
+// CHECK1-NEXT: [[CMP5:%.*]] = icmp slt i32 [[TMP12]], [[ADD]]
+// CHECK1-NEXT: br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK1: omp.inner.for.body:
+// CHECK1-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK1-NEXT: [[TMP15:%.*]] = zext i32 [[TMP14]] to i64
+// CHECK1-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: [[TMP17:%.*]] = zext i32 [[TMP16]] to i64
+// CHECK1-NEXT: [[TMP18:%.*]] = load i32, i32* [[CONV]], align 8
+// CHECK1-NEXT: [[CONV6:%.*]] = bitcast i64* [[N_CASTED]] to i32*
+// CHECK1-NEXT: store i32 [[TMP18]], i32* [[CONV6]], align 4
+// CHECK1-NEXT: [[TMP19:%.*]] = load i64, i64* [[N_CASTED]], align 8
+// CHECK1-NEXT: [[TMP20:%.*]] = load i32*, i32** [[V_ADDR]], align 8
+// CHECK1-NEXT: [[TMP21:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0
+// CHECK1-NEXT: [[TMP22:%.*]] = inttoptr i64 [[TMP15]] to i8*
+// CHECK1-NEXT: store i8* [[TMP22]], i8** [[TMP21]],
align 8 +// CHECK1-NEXT: [[TMP23:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1 +// CHECK1-NEXT: [[TMP24:%.*]] = inttoptr i64 [[TMP17]] to i8* +// CHECK1-NEXT: store i8* [[TMP24]], i8** [[TMP23]], align 8 +// CHECK1-NEXT: [[TMP25:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 2 +// CHECK1-NEXT: [[TMP26:%.*]] = inttoptr i64 [[TMP19]] to i8* +// CHECK1-NEXT: store i8* [[TMP26]], i8** [[TMP25]], align 8 +// CHECK1-NEXT: [[TMP27:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 3 +// CHECK1-NEXT: [[TMP28:%.*]] = bitcast [1000 x i32]* [[TMP0]] to i8* +// CHECK1-NEXT: store i8* [[TMP28]], i8** [[TMP27]], align 8 +// CHECK1-NEXT: [[TMP29:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 4 +// CHECK1-NEXT: [[TMP30:%.*]] = bitcast i32* [[TMP20]] to i8* +// CHECK1-NEXT: store i8* [[TMP30]], i8** [[TMP29]], align 8 +// CHECK1-NEXT: [[TMP31:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK1-NEXT: [[TMP32:%.*]] = load i32, i32* [[TMP31]], align 4 +// CHECK1-NEXT: [[TMP33:%.*]] = bitcast [5 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK1-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP32]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i64, i64, i64, [1000 x i32]*, i32*)* @__omp_outlined__11 to i8*), i8* null, i8** [[TMP33]], i64 5) +// CHECK1-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK1: omp.inner.for.inc: +// CHECK1-NEXT: [[TMP34:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: [[TMP35:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK1-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP34]], [[TMP35]] +// CHECK1-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: [[TMP36:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK1-NEXT: [[TMP37:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK1-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP36]], [[TMP37]] +// CHECK1-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK1-NEXT: [[TMP38:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK1-NEXT: [[TMP39:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK1-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP38]], [[TMP39]] +// CHECK1-NEXT: store i32 [[ADD9]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK1-NEXT: [[TMP40:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK1-NEXT: [[TMP41:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK1-NEXT: [[CMP10:%.*]] = icmp sgt i32 [[TMP40]], [[TMP41]] +// CHECK1-NEXT: br i1 [[CMP10]], label [[COND_TRUE11:%.*]], label [[COND_FALSE12:%.*]] +// CHECK1: cond.true11: +// CHECK1-NEXT: [[TMP42:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK1-NEXT: br label [[COND_END13:%.*]] +// CHECK1: cond.false12: +// CHECK1-NEXT: [[TMP43:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK1-NEXT: br label [[COND_END13]] +// CHECK1: cond.end13: +// CHECK1-NEXT: [[COND14:%.*]] = phi i32 [ [[TMP42]], [[COND_TRUE11]] ], [ [[TMP43]], [[COND_FALSE12]] ] +// CHECK1-NEXT: store i32 [[COND14]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK1-NEXT: [[TMP44:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK1-NEXT: store i32 [[TMP44]], i32* [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK1: omp.inner.for.end: +// CHECK1-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK1: omp.loop.exit: +// CHECK1-NEXT: [[TMP45:%.*]] = load i32*, i32** 
[[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK1-NEXT: [[TMP46:%.*]] = load i32, i32* [[TMP45]], align 4 +// CHECK1-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP46]]) +// CHECK1-NEXT: br label [[OMP_PRECOND_END]] +// CHECK1: omp.precond.end: +// CHECK1-NEXT: ret void +// +// +// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__11 +// CHECK1-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], i64 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32* [[V:%.*]]) #[[ATTR0]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK1-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 8 +// CHECK1-NEXT: [[V_ADDR:%.*]] = alloca i32*, align 8 +// CHECK1-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[I5:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK1-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +// CHECK1-NEXT: store i64 [[DOTPREVIOUS_LB_]], i64* [[DOTPREVIOUS_LB__ADDR]], align 8 +// CHECK1-NEXT: store i64 [[DOTPREVIOUS_UB_]], i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK1-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 +// CHECK1-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 8 +// CHECK1-NEXT: store i32* [[V]], i32** [[V_ADDR]], align 8 +// CHECK1-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* +// CHECK1-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 8 +// CHECK1-NEXT: [[TMP1:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK1-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK1-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 +// CHECK1-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK1-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK1-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK1-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK1-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK1-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] +// CHECK1-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK1: omp.precond.then: +// CHECK1-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 +// CHECK1-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK1-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_UB]], align 4 +// CHECK1-NEXT: [[TMP5:%.*]] = load i64, i64* [[DOTPREVIOUS_LB__ADDR]], align 8 +// CHECK1-NEXT: [[CONV3:%.*]] = trunc i64 [[TMP5]] to i32 +// CHECK1-NEXT: [[TMP6:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK1-NEXT: 
[[CONV4:%.*]] = trunc i64 [[TMP6]] to i32 +// CHECK1-NEXT: store i32 [[CONV3]], i32* [[DOTOMP_LB]], align 4 +// CHECK1-NEXT: store i32 [[CONV4]], i32* [[DOTOMP_UB]], align 4 +// CHECK1-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK1-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK1-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK1-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4 +// CHECK1-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP8]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) +// CHECK1-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK1-NEXT: store i32 [[TMP9]], i32* [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK1: omp.inner.for.cond: +// CHECK1-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: [[CONV6:%.*]] = sext i32 [[TMP10]] to i64 +// CHECK1-NEXT: [[TMP11:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK1-NEXT: [[CMP7:%.*]] = icmp ule i64 [[CONV6]], [[TMP11]] +// CHECK1-NEXT: br i1 [[CMP7]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK1: omp.inner.for.body: +// CHECK1-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP12]], 1 +// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK1-NEXT: store i32 [[ADD]], i32* [[I5]], align 4 +// CHECK1-NEXT: [[TMP13:%.*]] = load i32*, i32** [[V_ADDR]], align 8 +// CHECK1-NEXT: [[TMP14:%.*]] = load i32, i32* [[I5]], align 4 +// CHECK1-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP14]] to i64 +// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TMP13]], i64 [[IDXPROM]] +// CHECK1-NEXT: [[TMP15:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +// CHECK1-NEXT: [[TMP16:%.*]] = load i32, i32* [[I5]], align 4 +// CHECK1-NEXT: [[IDXPROM8:%.*]] = sext i32 [[TMP16]] to i64 +// CHECK1-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds [1000 x i32], [1000 x i32]* [[TMP0]], i64 0, i64 [[IDXPROM8]] +// CHECK1-NEXT: store i32 [[TMP15]], i32* [[ARRAYIDX9]], align 4 +// CHECK1-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK1: omp.body.continue: +// CHECK1-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK1: omp.inner.for.inc: +// CHECK1-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK1-NEXT: [[ADD10:%.*]] = add nsw i32 [[TMP17]], [[TMP18]] +// CHECK1-NEXT: store i32 [[ADD10]], i32* [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK1: omp.inner.for.end: +// CHECK1-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK1: omp.loop.exit: +// CHECK1-NEXT: [[TMP19:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK1-NEXT: [[TMP20:%.*]] = load i32, i32* [[TMP19]], align 4 +// CHECK1-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP20]]) +// CHECK1-NEXT: br label [[OMP_PRECOND_END]] +// CHECK1: omp.precond.end: +// CHECK1-NEXT: ret void +// +// +// CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l43 +// CHECK2-SAME: (i64 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i64 [[L:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 +// CHECK2-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 8 +// CHECK2-NEXT: [[L_ADDR:%.*]] = 
alloca i64, align 8 +// CHECK2-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8 +// CHECK2-NEXT: [[L_CASTED:%.*]] = alloca i64, align 8 +// CHECK2-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK2-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 +// CHECK2-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 8 +// CHECK2-NEXT: store i64 [[L]], i64* [[L_ADDR]], align 8 +// CHECK2-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* +// CHECK2-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 8 +// CHECK2-NEXT: [[CONV1:%.*]] = bitcast i64* [[L_ADDR]] to i32* +// CHECK2-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK2-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) +// CHECK2-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK2: .execute: +// CHECK2-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3:[0-9]+]]) +// CHECK2-NEXT: [[TMP2:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK2-NEXT: [[CONV2:%.*]] = bitcast i64* [[N_CASTED]] to i32* +// CHECK2-NEXT: store i32 [[TMP2]], i32* [[CONV2]], align 4 +// CHECK2-NEXT: [[TMP3:%.*]] = load i64, i64* [[N_CASTED]], align 8 +// CHECK2-NEXT: [[TMP4:%.*]] = load i32, i32* [[CONV1]], align 8 +// CHECK2-NEXT: [[CONV3:%.*]] = bitcast i64* [[L_CASTED]] to i32* +// CHECK2-NEXT: store i32 [[TMP4]], i32* [[CONV3]], align 4 +// CHECK2-NEXT: [[TMP5:%.*]] = load i64, i64* [[L_CASTED]], align 8 +// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK2-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i64 [[TMP3]], [1000 x i32]* [[TMP0]], i64 [[TMP5]]) #[[ATTR2:[0-9]+]] +// CHECK2-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK2: .omp.deinit: +// CHECK2-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) +// CHECK2-NEXT: br label [[DOTEXIT:%.*]] +// CHECK2: .exit: +// CHECK2-NEXT: ret void +// +// +// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__ +// CHECK2-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i64 [[L:%.*]]) #[[ATTR0]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK2-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 +// CHECK2-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 8 +// CHECK2-NEXT: [[L_ADDR:%.*]] = alloca i64, align 8 +// CHECK2-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTCAPTURE_EXPR_3:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[I5:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8 +// CHECK2-NEXT: [[L_CASTED:%.*]] = alloca i64, align 8 +// CHECK2-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [5 x i8*], align 8 +// CHECK2-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK2-NEXT: store i32* [[DOTBOUND_TID_]], 
i32** [[DOTBOUND_TID__ADDR]], align 8 +// CHECK2-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 +// CHECK2-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 8 +// CHECK2-NEXT: store i64 [[L]], i64* [[L_ADDR]], align 8 +// CHECK2-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* +// CHECK2-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 8 +// CHECK2-NEXT: [[CONV1:%.*]] = bitcast i64* [[L_ADDR]] to i32* +// CHECK2-NEXT: [[TMP1:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared", align 2 +// CHECK2-NEXT: [[TMP2:%.*]] = load i64, i64* @"_openmp_static_kernel$size", align 8 +// CHECK2-NEXT: call void @__kmpc_get_team_static_memory(i16 1, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds (%"union._shared_openmp_static_memory_type_$_", %"union._shared_openmp_static_memory_type_$_" addrspace(3)* @"_openmp_shared_static_glob_rd_$_", i32 0, i32 0, i32 0) to i8*), i64 [[TMP2]], i16 [[TMP1]], i8** addrspacecast (i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr" to i8**)) +// CHECK2-NEXT: [[TMP3:%.*]] = load i8*, i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr", align 8 +// CHECK2-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, i8* [[TMP3]], i64 0 +// CHECK2-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to %struct._globalized_locals_ty* +// CHECK2-NEXT: [[L2:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP5]], i32 0, i32 0 +// CHECK2-NEXT: [[TMP6:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK2-NEXT: store i32 [[TMP6]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK2-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK2-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP7]], 0 +// CHECK2-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK2-NEXT: [[SUB4:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK2-NEXT: store i32 [[SUB4]], i32* [[DOTCAPTURE_EXPR_3]], align 4 +// CHECK2-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK2-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK2-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP8]] +// CHECK2-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK2: omp.precond.then: +// CHECK2-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK2-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4 +// CHECK2-NEXT: store i32 [[TMP9]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK2-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK2-NEXT: [[TMP10:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK2-NEXT: [[TMP11:%.*]] = load i32, i32* [[TMP10]], align 4 +// CHECK2-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1:[0-9]+]], i32 [[TMP11]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 128) +// CHECK2-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4 +// CHECK2-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[TMP12]], [[TMP13]] +// CHECK2-NEXT: br i1 [[CMP6]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK2: cond.true: +// CHECK2-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4 +// CHECK2-NEXT: br label [[COND_END:%.*]] +// CHECK2: cond.false: +// CHECK2-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: br label [[COND_END]] +// CHECK2: cond.end: +// CHECK2-NEXT: 
[[COND:%.*]] = phi i32 [ [[TMP14]], [[COND_TRUE]] ], [ [[TMP15]], [[COND_FALSE]] ] +// CHECK2-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK2-NEXT: store i32 [[TMP16]], i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK2: omp.inner.for.cond: +// CHECK2-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4 +// CHECK2-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP18]], 1 +// CHECK2-NEXT: [[CMP7:%.*]] = icmp slt i32 [[TMP17]], [[ADD]] +// CHECK2-NEXT: br i1 [[CMP7]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK2: omp.inner.for.body: +// CHECK2-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK2-NEXT: [[TMP20:%.*]] = zext i32 [[TMP19]] to i64 +// CHECK2-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: [[TMP22:%.*]] = zext i32 [[TMP21]] to i64 +// CHECK2-NEXT: [[TMP23:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK2-NEXT: [[CONV8:%.*]] = bitcast i64* [[N_CASTED]] to i32* +// CHECK2-NEXT: store i32 [[TMP23]], i32* [[CONV8]], align 4 +// CHECK2-NEXT: [[TMP24:%.*]] = load i64, i64* [[N_CASTED]], align 8 +// CHECK2-NEXT: [[TMP25:%.*]] = load i32, i32* [[CONV1]], align 8 +// CHECK2-NEXT: [[CONV9:%.*]] = bitcast i64* [[L_CASTED]] to i32* +// CHECK2-NEXT: store i32 [[TMP25]], i32* [[CONV9]], align 4 +// CHECK2-NEXT: [[TMP26:%.*]] = load i64, i64* [[L_CASTED]], align 8 +// CHECK2-NEXT: [[TMP27:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0 +// CHECK2-NEXT: [[TMP28:%.*]] = inttoptr i64 [[TMP20]] to i8* +// CHECK2-NEXT: store i8* [[TMP28]], i8** [[TMP27]], align 8 +// CHECK2-NEXT: [[TMP29:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1 +// CHECK2-NEXT: [[TMP30:%.*]] = inttoptr i64 [[TMP22]] to i8* +// CHECK2-NEXT: store i8* [[TMP30]], i8** [[TMP29]], align 8 +// CHECK2-NEXT: [[TMP31:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 2 +// CHECK2-NEXT: [[TMP32:%.*]] = inttoptr i64 [[TMP24]] to i8* +// CHECK2-NEXT: store i8* [[TMP32]], i8** [[TMP31]], align 8 +// CHECK2-NEXT: [[TMP33:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 3 +// CHECK2-NEXT: [[TMP34:%.*]] = bitcast [1000 x i32]* [[TMP0]] to i8* +// CHECK2-NEXT: store i8* [[TMP34]], i8** [[TMP33]], align 8 +// CHECK2-NEXT: [[TMP35:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 4 +// CHECK2-NEXT: [[TMP36:%.*]] = inttoptr i64 [[TMP26]] to i8* +// CHECK2-NEXT: store i8* [[TMP36]], i8** [[TMP35]], align 8 +// CHECK2-NEXT: [[TMP37:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK2-NEXT: [[TMP38:%.*]] = load i32, i32* [[TMP37]], align 4 +// CHECK2-NEXT: [[TMP39:%.*]] = bitcast [5 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK2-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP38]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i64, i64, i64, [1000 x i32]*, i64)* @__omp_outlined__1 to i8*), i8* null, i8** [[TMP39]], i64 5) +// CHECK2-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK2: omp.inner.for.inc: +// CHECK2-NEXT: [[TMP40:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: [[TMP41:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK2-NEXT: [[ADD10:%.*]] = add nsw i32 
[[TMP40]], [[TMP41]] +// CHECK2-NEXT: store i32 [[ADD10]], i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: [[TMP42:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK2-NEXT: [[TMP43:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK2-NEXT: [[ADD11:%.*]] = add nsw i32 [[TMP42]], [[TMP43]] +// CHECK2-NEXT: store i32 [[ADD11]], i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK2-NEXT: [[TMP44:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: [[TMP45:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK2-NEXT: [[ADD12:%.*]] = add nsw i32 [[TMP44]], [[TMP45]] +// CHECK2-NEXT: store i32 [[ADD12]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: [[TMP46:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: [[TMP47:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4 +// CHECK2-NEXT: [[CMP13:%.*]] = icmp sgt i32 [[TMP46]], [[TMP47]] +// CHECK2-NEXT: br i1 [[CMP13]], label [[COND_TRUE14:%.*]], label [[COND_FALSE15:%.*]] +// CHECK2: cond.true14: +// CHECK2-NEXT: [[TMP48:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4 +// CHECK2-NEXT: br label [[COND_END16:%.*]] +// CHECK2: cond.false15: +// CHECK2-NEXT: [[TMP49:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: br label [[COND_END16]] +// CHECK2: cond.end16: +// CHECK2-NEXT: [[COND17:%.*]] = phi i32 [ [[TMP48]], [[COND_TRUE14]] ], [ [[TMP49]], [[COND_FALSE15]] ] +// CHECK2-NEXT: store i32 [[COND17]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: [[TMP50:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK2-NEXT: store i32 [[TMP50]], i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK2: omp.inner.for.end: +// CHECK2-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK2: omp.loop.exit: +// CHECK2-NEXT: [[TMP51:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK2-NEXT: [[TMP52:%.*]] = load i32, i32* [[TMP51]], align 4 +// CHECK2-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP52]]) +// CHECK2-NEXT: [[TMP53:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK2-NEXT: [[TMP54:%.*]] = icmp ne i32 [[TMP53]], 0 +// CHECK2-NEXT: br i1 [[TMP54]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]] +// CHECK2: .omp.lastprivate.then: +// CHECK2-NEXT: [[TMP55:%.*]] = load i32, i32* [[CONV1]], align 8 +// CHECK2-NEXT: store i32 [[TMP55]], i32* [[CONV1]], align 8 +// CHECK2-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]] +// CHECK2: .omp.lastprivate.done: +// CHECK2-NEXT: br label [[OMP_PRECOND_END]] +// CHECK2: omp.precond.end: +// CHECK2-NEXT: [[TMP56:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared", align 2 +// CHECK2-NEXT: call void @__kmpc_restore_team_static_memory(i16 1, i16 [[TMP56]]) +// CHECK2-NEXT: ret void +// +// +// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__1 +// CHECK2-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], i64 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i64 [[L:%.*]]) #[[ATTR0]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK2-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8 +// CHECK2-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8 +// CHECK2-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 +// CHECK2-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 8 +// CHECK2-NEXT: [[L_ADDR:%.*]] = alloca 
i64, align 8 +// CHECK2-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[I6:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK2-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +// CHECK2-NEXT: store i64 [[DOTPREVIOUS_LB_]], i64* [[DOTPREVIOUS_LB__ADDR]], align 8 +// CHECK2-NEXT: store i64 [[DOTPREVIOUS_UB_]], i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK2-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 +// CHECK2-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 8 +// CHECK2-NEXT: store i64 [[L]], i64* [[L_ADDR]], align 8 +// CHECK2-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* +// CHECK2-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 8 +// CHECK2-NEXT: [[CONV1:%.*]] = bitcast i64* [[L_ADDR]] to i32* +// CHECK2-NEXT: [[TMP1:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK2-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK2-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 +// CHECK2-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK2-NEXT: [[SUB3:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK2-NEXT: store i32 [[SUB3]], i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK2-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK2-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK2-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] +// CHECK2-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK2: omp.precond.then: +// CHECK2-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 +// CHECK2-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK2-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_UB]], align 4 +// CHECK2-NEXT: [[TMP5:%.*]] = load i64, i64* [[DOTPREVIOUS_LB__ADDR]], align 8 +// CHECK2-NEXT: [[CONV4:%.*]] = trunc i64 [[TMP5]] to i32 +// CHECK2-NEXT: [[TMP6:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK2-NEXT: [[CONV5:%.*]] = trunc i64 [[TMP6]] to i32 +// CHECK2-NEXT: store i32 [[CONV4]], i32* [[DOTOMP_LB]], align 4 +// CHECK2-NEXT: store i32 [[CONV5]], i32* [[DOTOMP_UB]], align 4 +// CHECK2-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK2-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK2-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK2-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4 +// CHECK2-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP8]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 32) +// CHECK2-NEXT: br label [[OMP_DISPATCH_COND:%.*]] +// CHECK2: omp.dispatch.cond: +// CHECK2-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK2-NEXT: [[CONV7:%.*]] = sext i32 [[TMP9]] to i64 +// CHECK2-NEXT: [[TMP10:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK2-NEXT: [[CMP8:%.*]] = icmp ugt i64 [[CONV7]], 
[[TMP10]] +// CHECK2-NEXT: br i1 [[CMP8]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK2: cond.true: +// CHECK2-NEXT: [[TMP11:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK2-NEXT: br label [[COND_END:%.*]] +// CHECK2: cond.false: +// CHECK2-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK2-NEXT: [[CONV9:%.*]] = sext i32 [[TMP12]] to i64 +// CHECK2-NEXT: br label [[COND_END]] +// CHECK2: cond.end: +// CHECK2-NEXT: [[COND:%.*]] = phi i64 [ [[TMP11]], [[COND_TRUE]] ], [ [[CONV9]], [[COND_FALSE]] ] +// CHECK2-NEXT: [[CONV10:%.*]] = trunc i64 [[COND]] to i32 +// CHECK2-NEXT: store i32 [[CONV10]], i32* [[DOTOMP_UB]], align 4 +// CHECK2-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK2-NEXT: store i32 [[TMP13]], i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK2-NEXT: [[CMP11:%.*]] = icmp sle i32 [[TMP14]], [[TMP15]] +// CHECK2-NEXT: br i1 [[CMP11]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]] +// CHECK2: omp.dispatch.body: +// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK2: omp.inner.for.cond: +// CHECK2-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK2-NEXT: [[CMP12:%.*]] = icmp sle i32 [[TMP16]], [[TMP17]] +// CHECK2-NEXT: br i1 [[CMP12]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK2: omp.inner.for.body: +// CHECK2-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP18]], 1 +// CHECK2-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK2-NEXT: store i32 [[ADD]], i32* [[I6]], align 4 +// CHECK2-NEXT: [[TMP19:%.*]] = load i32, i32* [[I6]], align 4 +// CHECK2-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP19]] to i64 +// CHECK2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1000 x i32], [1000 x i32]* [[TMP0]], i64 0, i64 [[IDXPROM]] +// CHECK2-NEXT: store i32 1, i32* [[ARRAYIDX]], align 4 +// CHECK2-NEXT: [[TMP20:%.*]] = load i32, i32* [[I6]], align 4 +// CHECK2-NEXT: store i32 [[TMP20]], i32* [[CONV1]], align 8 +// CHECK2-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK2: omp.body.continue: +// CHECK2-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK2: omp.inner.for.inc: +// CHECK2-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: [[ADD13:%.*]] = add nsw i32 [[TMP21]], 1 +// CHECK2-NEXT: store i32 [[ADD13]], i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK2: omp.inner.for.end: +// CHECK2-NEXT: br label [[OMP_DISPATCH_INC:%.*]] +// CHECK2: omp.dispatch.inc: +// CHECK2-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK2-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK2-NEXT: [[ADD14:%.*]] = add nsw i32 [[TMP22]], [[TMP23]] +// CHECK2-NEXT: store i32 [[ADD14]], i32* [[DOTOMP_LB]], align 4 +// CHECK2-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK2-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK2-NEXT: [[ADD15:%.*]] = add nsw i32 [[TMP24]], [[TMP25]] +// CHECK2-NEXT: store i32 [[ADD15]], i32* [[DOTOMP_UB]], align 4 +// CHECK2-NEXT: br label [[OMP_DISPATCH_COND]] +// CHECK2: omp.dispatch.end: +// CHECK2-NEXT: [[TMP26:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK2-NEXT: [[TMP27:%.*]] = load i32, i32* [[TMP26]], align 
4 +// CHECK2-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP27]]) +// CHECK2-NEXT: [[TMP28:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK2-NEXT: [[TMP29:%.*]] = icmp ne i32 [[TMP28]], 0 +// CHECK2-NEXT: br i1 [[TMP29]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]] +// CHECK2: .omp.lastprivate.then: +// CHECK2-NEXT: [[TMP30:%.*]] = load i32, i32* [[CONV1]], align 8 +// CHECK2-NEXT: store i32 [[TMP30]], i32* [[CONV1]], align 8 +// CHECK2-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]] +// CHECK2: .omp.lastprivate.done: +// CHECK2-NEXT: br label [[OMP_PRECOND_END]] +// CHECK2: omp.precond.end: +// CHECK2-NEXT: ret void +// +// +// CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l49 +// CHECK2-SAME: (i64 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 +// CHECK2-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 8 +// CHECK2-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8 +// CHECK2-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK2-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 +// CHECK2-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 8 +// CHECK2-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* +// CHECK2-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 8 +// CHECK2-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK2-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) +// CHECK2-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK2: .execute: +// CHECK2-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) +// CHECK2-NEXT: [[TMP2:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK2-NEXT: [[CONV1:%.*]] = bitcast i64* [[N_CASTED]] to i32* +// CHECK2-NEXT: store i32 [[TMP2]], i32* [[CONV1]], align 4 +// CHECK2-NEXT: [[TMP3:%.*]] = load i64, i64* [[N_CASTED]], align 8 +// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK2-NEXT: call void @__omp_outlined__2(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i64 [[TMP3]], [1000 x i16]* [[TMP0]]) #[[ATTR2]] +// CHECK2-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK2: .omp.deinit: +// CHECK2-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) +// CHECK2-NEXT: br label [[DOTEXIT:%.*]] +// CHECK2: .exit: +// CHECK2-NEXT: ret void +// +// +// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__2 +// CHECK2-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK2-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 +// CHECK2-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 8 +// CHECK2-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca 
i32, align 4 +// CHECK2-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[I3:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8 +// CHECK2-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x i8*], align 8 +// CHECK2-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK2-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +// CHECK2-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 +// CHECK2-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 8 +// CHECK2-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* +// CHECK2-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 8 +// CHECK2-NEXT: [[TMP1:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK2-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK2-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 +// CHECK2-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK2-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK2-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK2-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK2-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK2-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] +// CHECK2-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK2: omp.precond.then: +// CHECK2-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK2-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK2-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK2-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK2-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK2-NEXT: [[TMP5:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK2-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4 +// CHECK2-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP6]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) +// CHECK2-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK2-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP7]], [[TMP8]] +// CHECK2-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK2: cond.true: +// CHECK2-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK2-NEXT: br label [[COND_END:%.*]] +// CHECK2: cond.false: +// CHECK2-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: br label [[COND_END]] +// CHECK2: cond.end: +// CHECK2-NEXT: [[COND:%.*]] = phi i32 [ [[TMP9]], [[COND_TRUE]] ], [ [[TMP10]], [[COND_FALSE]] ] +// CHECK2-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK2-NEXT: store i32 [[TMP11]], i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK2: omp.inner.for.cond: +// CHECK2-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK2-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP13]], 1 +// 
CHECK2-NEXT: [[CMP5:%.*]] = icmp slt i32 [[TMP12]], [[ADD]] +// CHECK2-NEXT: br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK2: omp.inner.for.body: +// CHECK2-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK2-NEXT: [[TMP15:%.*]] = zext i32 [[TMP14]] to i64 +// CHECK2-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: [[TMP17:%.*]] = zext i32 [[TMP16]] to i64 +// CHECK2-NEXT: [[TMP18:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK2-NEXT: [[CONV6:%.*]] = bitcast i64* [[N_CASTED]] to i32* +// CHECK2-NEXT: store i32 [[TMP18]], i32* [[CONV6]], align 4 +// CHECK2-NEXT: [[TMP19:%.*]] = load i64, i64* [[N_CASTED]], align 8 +// CHECK2-NEXT: [[TMP20:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0 +// CHECK2-NEXT: [[TMP21:%.*]] = inttoptr i64 [[TMP15]] to i8* +// CHECK2-NEXT: store i8* [[TMP21]], i8** [[TMP20]], align 8 +// CHECK2-NEXT: [[TMP22:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1 +// CHECK2-NEXT: [[TMP23:%.*]] = inttoptr i64 [[TMP17]] to i8* +// CHECK2-NEXT: store i8* [[TMP23]], i8** [[TMP22]], align 8 +// CHECK2-NEXT: [[TMP24:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 2 +// CHECK2-NEXT: [[TMP25:%.*]] = inttoptr i64 [[TMP19]] to i8* +// CHECK2-NEXT: store i8* [[TMP25]], i8** [[TMP24]], align 8 +// CHECK2-NEXT: [[TMP26:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 3 +// CHECK2-NEXT: [[TMP27:%.*]] = bitcast [1000 x i16]* [[TMP0]] to i8* +// CHECK2-NEXT: store i8* [[TMP27]], i8** [[TMP26]], align 8 +// CHECK2-NEXT: [[TMP28:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK2-NEXT: [[TMP29:%.*]] = load i32, i32* [[TMP28]], align 4 +// CHECK2-NEXT: [[TMP30:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK2-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP29]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i64, i64, i64, [1000 x i16]*)* @__omp_outlined__3 to i8*), i8* null, i8** [[TMP30]], i64 4) +// CHECK2-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK2: omp.inner.for.inc: +// CHECK2-NEXT: [[TMP31:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: [[TMP32:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK2-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP31]], [[TMP32]] +// CHECK2-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: [[TMP33:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK2-NEXT: [[TMP34:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK2-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP33]], [[TMP34]] +// CHECK2-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK2-NEXT: [[TMP35:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: [[TMP36:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK2-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP35]], [[TMP36]] +// CHECK2-NEXT: store i32 [[ADD9]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: [[TMP37:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: [[TMP38:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK2-NEXT: [[CMP10:%.*]] = icmp sgt i32 [[TMP37]], [[TMP38]] +// CHECK2-NEXT: br i1 [[CMP10]], label [[COND_TRUE11:%.*]], label [[COND_FALSE12:%.*]] +// CHECK2: cond.true11: +// CHECK2-NEXT: [[TMP39:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK2-NEXT: br label 
[[COND_END13:%.*]] +// CHECK2: cond.false12: +// CHECK2-NEXT: [[TMP40:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: br label [[COND_END13]] +// CHECK2: cond.end13: +// CHECK2-NEXT: [[COND14:%.*]] = phi i32 [ [[TMP39]], [[COND_TRUE11]] ], [ [[TMP40]], [[COND_FALSE12]] ] +// CHECK2-NEXT: store i32 [[COND14]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: [[TMP41:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK2-NEXT: store i32 [[TMP41]], i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK2: omp.inner.for.end: +// CHECK2-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK2: omp.loop.exit: +// CHECK2-NEXT: [[TMP42:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK2-NEXT: [[TMP43:%.*]] = load i32, i32* [[TMP42]], align 4 +// CHECK2-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP43]]) +// CHECK2-NEXT: br label [[OMP_PRECOND_END]] +// CHECK2: omp.precond.end: +// CHECK2-NEXT: ret void +// +// +// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__3 +// CHECK2-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], i64 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK2-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8 +// CHECK2-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8 +// CHECK2-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 +// CHECK2-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 8 +// CHECK2-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[I5:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK2-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +// CHECK2-NEXT: store i64 [[DOTPREVIOUS_LB_]], i64* [[DOTPREVIOUS_LB__ADDR]], align 8 +// CHECK2-NEXT: store i64 [[DOTPREVIOUS_UB_]], i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK2-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 +// CHECK2-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 8 +// CHECK2-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* +// CHECK2-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 8 +// CHECK2-NEXT: [[TMP1:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK2-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK2-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 +// CHECK2-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK2-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK2-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK2-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK2-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK2-NEXT: [[CMP:%.*]] = icmp slt i32 
0, [[TMP3]]
+// CHECK2-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]]
+// CHECK2: omp.precond.then:
+// CHECK2-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK2-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK2-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_UB]], align 4
+// CHECK2-NEXT: [[TMP5:%.*]] = load i64, i64* [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK2-NEXT: [[CONV3:%.*]] = trunc i64 [[TMP5]] to i32
+// CHECK2-NEXT: [[TMP6:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK2-NEXT: [[CONV4:%.*]] = trunc i64 [[TMP6]] to i32
+// CHECK2-NEXT: store i32 [[CONV3]], i32* [[DOTOMP_LB]], align 4
+// CHECK2-NEXT: store i32 [[CONV4]], i32* [[DOTOMP_UB]], align 4
+// CHECK2-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK2-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK2-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK2-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4
+// CHECK2-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP8]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK2-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK2-NEXT: store i32 [[TMP9]], i32* [[DOTOMP_IV]], align 4
+// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK2: omp.inner.for.cond:
+// CHECK2-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK2-NEXT: [[CONV6:%.*]] = sext i32 [[TMP10]] to i64
+// CHECK2-NEXT: [[TMP11:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK2-NEXT: [[CMP7:%.*]] = icmp ule i64 [[CONV6]], [[TMP11]]
+// CHECK2-NEXT: br i1 [[CMP7]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK2: omp.inner.for.body:
+// CHECK2-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK2-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP12]], 1
+// CHECK2-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK2-NEXT: store i32 [[ADD]], i32* [[I5]], align 4
+// CHECK2-NEXT: [[TMP13:%.*]] = load i32, i32* [[I5]], align 4
+// CHECK2-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP13]] to i64
+// CHECK2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1000 x i16], [1000 x i16]* [[TMP0]], i64 0, i64 [[IDXPROM]]
+// CHECK2-NEXT: [[TMP14:%.*]] = load i16, i16* [[ARRAYIDX]], align 2
+// CHECK2-NEXT: [[CONV8:%.*]] = sext i16 [[TMP14]] to i32
+// CHECK2-NEXT: [[ADD9:%.*]] = add nsw i32 [[CONV8]], 1
+// CHECK2-NEXT: [[CONV10:%.*]] = trunc i32 [[ADD9]] to i16
+// CHECK2-NEXT: store i16 [[CONV10]], i16* [[ARRAYIDX]], align 2
+// CHECK2-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK2: omp.body.continue:
+// CHECK2-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK2: omp.inner.for.inc:
+// CHECK2-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK2-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK2-NEXT: [[ADD11:%.*]] = add nsw i32 [[TMP15]], [[TMP16]]
+// CHECK2-NEXT: store i32 [[ADD11]], i32* [[DOTOMP_IV]], align 4
+// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK2: omp.inner.for.end:
+// CHECK2-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK2: omp.loop.exit:
+// CHECK2-NEXT: [[TMP17:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK2-NEXT: [[TMP18:%.*]] = load i32, i32* [[TMP17]], align 4
+// CHECK2-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP18]])
+// CHECK2-NEXT: br label [[OMP_PRECOND_END]]
+// CHECK2: omp.precond.end:
+// CHECK2-NEXT: ret void
+//
+//
+// CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l54
+// CHECK2-SAME: ([10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] {
+// CHECK2-NEXT: entry:
+// CHECK2-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 8
+// CHECK2-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK2-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 8
+// CHECK2-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 8
+// CHECK2-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK2-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0)
+// CHECK2-NEXT: br label [[DOTEXECUTE:%.*]]
+// CHECK2: .execute:
+// CHECK2-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]])
+// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK2-NEXT: call void @__omp_outlined__4(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x i32]* [[TMP0]]) #[[ATTR2]]
+// CHECK2-NEXT: br label [[DOTOMP_DEINIT:%.*]]
+// CHECK2: .omp.deinit:
+// CHECK2-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0)
+// CHECK2-NEXT: br label [[DOTEXIT:%.*]]
+// CHECK2: .exit:
+// CHECK2-NEXT: ret void
+//
+//
+// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__4
+// CHECK2-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] {
+// CHECK2-NEXT: entry:
+// CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK2-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 8
+// CHECK2-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [3 x i8*], align 8
+// CHECK2-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK2-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
+// CHECK2-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 8
+// CHECK2-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 8
+// CHECK2-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK2-NEXT: store i32 9, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK2-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK2-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK2-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK2-NEXT: [[TMP1:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK2-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4
+// CHECK2-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]])
+// CHECK2-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK2-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP3]], 9
+// CHECK2-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK2: cond.true:
+// CHECK2-NEXT: br label [[COND_END:%.*]]
+// CHECK2: cond.false:
+// CHECK2-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK2-NEXT: br label [[COND_END]]
+// CHECK2: cond.end:
+// CHECK2-NEXT: [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP4]], [[COND_FALSE]] ]
+// CHECK2-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK2-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK2-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4
+// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK2: omp.inner.for.cond:
+// CHECK2-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK2-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP6]], 10
+// CHECK2-NEXT: br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK2: omp.inner.for.body:
+// CHECK2-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK2-NEXT: [[TMP8:%.*]] = zext i32 [[TMP7]] to i64
+// CHECK2-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK2-NEXT: [[TMP10:%.*]] = zext i32 [[TMP9]] to i64
+// CHECK2-NEXT: [[TMP11:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0
+// CHECK2-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP8]] to i8*
+// CHECK2-NEXT: store i8* [[TMP12]], i8** [[TMP11]], align 8
+// CHECK2-NEXT: [[TMP13:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1
+// CHECK2-NEXT: [[TMP14:%.*]] = inttoptr i64 [[TMP10]] to i8*
+// CHECK2-NEXT: store i8* [[TMP14]], i8** [[TMP13]], align 8
+// CHECK2-NEXT: [[TMP15:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 2
+// CHECK2-NEXT: [[TMP16:%.*]] = bitcast [10 x i32]* [[TMP0]] to i8*
+// CHECK2-NEXT: store i8* [[TMP16]], i8** [[TMP15]], align 8
+// CHECK2-NEXT: [[TMP17:%.*]] = bitcast [3 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK2-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i64, i64, [10 x i32]*)* @__omp_outlined__5 to i8*), i8* null, i8** [[TMP17]], i64 3)
+// CHECK2-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK2: omp.inner.for.inc:
+// CHECK2-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK2-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK2-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP18]], [[TMP19]]
+// CHECK2-NEXT: store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4
+// CHECK2-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK2-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK2-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP20]], [[TMP21]]
+// CHECK2-NEXT: store i32 [[ADD2]], i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK2-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK2-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK2-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP22]], [[TMP23]]
+// CHECK2-NEXT: store i32 [[ADD3]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK2-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK2-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP24]], 9
+// CHECK2-NEXT: br i1 [[CMP4]], label [[COND_TRUE5:%.*]], label [[COND_FALSE6:%.*]]
+// CHECK2: cond.true5:
+// CHECK2-NEXT: br label [[COND_END7:%.*]]
+// CHECK2: cond.false6:
+// CHECK2-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK2-NEXT: br label [[COND_END7]]
+// CHECK2: cond.end7:
+// CHECK2-NEXT: [[COND8:%.*]] = phi i32 [ 9, [[COND_TRUE5]] ], [ [[TMP25]], [[COND_FALSE6]] ]
+// CHECK2-NEXT: store i32 [[COND8]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK2-NEXT: [[TMP26:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK2-NEXT: store i32 [[TMP26]], i32* [[DOTOMP_IV]], align 4
+// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK2: omp.inner.for.end:
+// CHECK2-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK2: omp.loop.exit:
+// CHECK2-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]])
+// CHECK2-NEXT: ret void
+//
+//
+// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__5
+// CHECK2-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] {
+// CHECK2-NEXT: entry:
+// CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK2-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8
+// CHECK2-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8
+// CHECK2-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 8
+// CHECK2-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK2-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
+// CHECK2-NEXT: store i64 [[DOTPREVIOUS_LB_]], i64* [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK2-NEXT: store i64 [[DOTPREVIOUS_UB_]], i64* [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK2-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 8
+// CHECK2-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 8
+// CHECK2-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK2-NEXT: store i32 9, i32* [[DOTOMP_UB]], align 4
+// CHECK2-NEXT: [[TMP1:%.*]] = load i64, i64* [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK2-NEXT: [[CONV:%.*]] = trunc i64 [[TMP1]] to i32
+// CHECK2-NEXT: [[TMP2:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK2-NEXT: [[CONV1:%.*]] = trunc i64 [[TMP2]] to i32
+// CHECK2-NEXT: store i32 [[CONV]], i32* [[DOTOMP_LB]], align 4
+// CHECK2-NEXT: store i32 [[CONV1]], i32* [[DOTOMP_UB]], align 4
+// CHECK2-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK2-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK2-NEXT: [[TMP3:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK2-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4
+// CHECK2-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP4]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK2-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK2-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4
+// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK2: omp.inner.for.cond:
+// CHECK2-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK2-NEXT: [[CONV2:%.*]] = sext i32 [[TMP6]] to i64
+// CHECK2-NEXT: [[TMP7:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK2-NEXT: [[CMP:%.*]] = icmp ule i64 [[CONV2]], [[TMP7]]
+// CHECK2-NEXT: br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK2: omp.inner.for.body:
+// CHECK2-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK2-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP8]], 1
+// CHECK2-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK2-NEXT: store i32 [[ADD]], i32* [[I]], align 4
+// CHECK2-NEXT: [[TMP9:%.*]] = load i32, i32* [[I]], align 4
+// CHECK2-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP9]] to i64
+// CHECK2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP0]], i64 0, i64 [[IDXPROM]]
+// CHECK2-NEXT: [[TMP10:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+// CHECK2-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP10]], 1
+// CHECK2-NEXT: store i32 [[ADD3]], i32* [[ARRAYIDX]], align 4
+// CHECK2-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK2: omp.body.continue:
+// CHECK2-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK2: omp.inner.for.inc:
+// CHECK2-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK2-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK2-NEXT: [[ADD4:%.*]] = add nsw i32 [[TMP11]], [[TMP12]]
+// CHECK2-NEXT: store i32 [[ADD4]], i32* [[DOTOMP_IV]], align 4
+// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK2: omp.inner.for.end:
+// CHECK2-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK2: omp.loop.exit:
+// CHECK2-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP4]])
+// CHECK2-NEXT: ret void
+//
+//
+// CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l59
+// CHECK2-SAME: ([10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i64 [[F:%.*]]) #[[ATTR0]] {
+// CHECK2-NEXT: entry:
+// CHECK2-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 8
+// CHECK2-NEXT: [[F_ADDR:%.*]] = alloca i64, align 8
+// CHECK2-NEXT: [[F_CASTED:%.*]] = alloca i64, align 8
+// CHECK2-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK2-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 8
+// CHECK2-NEXT: store i64 [[F]], i64* [[F_ADDR]], align 8
+// CHECK2-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 8
+// CHECK2-NEXT: [[CONV:%.*]] = bitcast i64* [[F_ADDR]] to i32*
+// CHECK2-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK2-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0)
+// CHECK2-NEXT: br label [[DOTEXECUTE:%.*]]
+// CHECK2: .execute:
+// CHECK2-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]])
+// CHECK2-NEXT: [[TMP2:%.*]] = load i32, i32* [[CONV]], align 8
+// CHECK2-NEXT: [[CONV1:%.*]] = bitcast i64* [[F_CASTED]] to i32*
+// CHECK2-NEXT: store i32 [[TMP2]], i32* [[CONV1]], align 4
+// CHECK2-NEXT: [[TMP3:%.*]] = load i64, i64* [[F_CASTED]], align 8
+// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK2-NEXT: call void @__omp_outlined__6(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x [10 x i32]]* [[TMP0]], i64 [[TMP3]]) #[[ATTR2]]
+// CHECK2-NEXT: br label [[DOTOMP_DEINIT:%.*]]
+// CHECK2: .omp.deinit:
+// CHECK2-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0)
+// CHECK2-NEXT: br label [[DOTEXIT:%.*]]
+// CHECK2: .exit:
+// CHECK2-NEXT: ret void
+//
+//
+// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__6
+// CHECK2-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i64 [[F:%.*]]) #[[ATTR0]] {
+// CHECK2-NEXT: entry:
+// CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK2-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 8
+// CHECK2-NEXT: [[F_ADDR:%.*]] = alloca i64, align 8
+// CHECK2-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[_TMP1:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[K:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[J:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[F_CASTED:%.*]] = alloca i64, align 8
+// CHECK2-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x i8*], align 8
+// CHECK2-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK2-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
+// CHECK2-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 8
+// CHECK2-NEXT: store i64 [[F]], i64* [[F_ADDR]], align 8
+// CHECK2-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 8
+// CHECK2-NEXT: [[CONV:%.*]] = bitcast i64* [[F_ADDR]] to i32*
+// CHECK2-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK2-NEXT: store i32 99, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK2-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK2-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK2-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK2-NEXT: [[TMP1:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK2-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4
+// CHECK2-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]])
+// CHECK2-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK2-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP3]], 99
+// CHECK2-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK2: cond.true:
+// CHECK2-NEXT: br label [[COND_END:%.*]]
+// CHECK2: cond.false:
+// CHECK2-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK2-NEXT: br label [[COND_END]]
+// CHECK2: cond.end:
+// CHECK2-NEXT: [[COND:%.*]] = phi i32 [ 99, [[COND_TRUE]] ], [ [[TMP4]], [[COND_FALSE]] ]
+// CHECK2-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK2-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK2-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4
+// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK2: omp.inner.for.cond:
+// CHECK2-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK2-NEXT: [[CMP2:%.*]] = icmp slt i32 [[TMP6]], 100
+// CHECK2-NEXT: br i1 [[CMP2]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK2: omp.inner.for.body:
+// CHECK2-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK2-NEXT: [[TMP8:%.*]] = zext i32 [[TMP7]] to i64
+// CHECK2-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK2-NEXT: [[TMP10:%.*]] = zext i32 [[TMP9]] to i64
+// CHECK2-NEXT: [[TMP11:%.*]] = load i32, i32* [[CONV]], align 8
+// CHECK2-NEXT: [[CONV3:%.*]] = bitcast i64* [[F_CASTED]] to i32*
+// CHECK2-NEXT: store i32 [[TMP11]], i32* [[CONV3]], align 4
+// CHECK2-NEXT: [[TMP12:%.*]] = load i64, i64* [[F_CASTED]], align 8
+// CHECK2-NEXT: [[TMP13:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0
+// CHECK2-NEXT: [[TMP14:%.*]] = inttoptr i64 [[TMP8]] to i8*
+// CHECK2-NEXT: store i8* [[TMP14]], i8** [[TMP13]], align 8
+// CHECK2-NEXT: [[TMP15:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1
+// CHECK2-NEXT: [[TMP16:%.*]] = inttoptr i64 [[TMP10]] to i8*
+// CHECK2-NEXT: store i8* [[TMP16]], i8** [[TMP15]], align 8
+// CHECK2-NEXT: [[TMP17:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 2
+// CHECK2-NEXT: [[TMP18:%.*]] = bitcast [10 x [10 x i32]]* [[TMP0]] to i8*
+// CHECK2-NEXT: store i8* [[TMP18]], i8** [[TMP17]], align 8
+// CHECK2-NEXT: [[TMP19:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 3
+// CHECK2-NEXT: [[TMP20:%.*]] = inttoptr i64 [[TMP12]] to i8*
+// CHECK2-NEXT: store i8* [[TMP20]], i8** [[TMP19]], align 8
+// CHECK2-NEXT: [[TMP21:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK2-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i64, i64, [10 x [10 x i32]]*, i64)* @__omp_outlined__7 to i8*), i8* null, i8** [[TMP21]], i64 4)
+// CHECK2-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK2: omp.inner.for.inc:
+// CHECK2-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK2-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK2-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP22]], [[TMP23]]
+// CHECK2-NEXT: store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4
+// CHECK2-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK2-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK2-NEXT: [[ADD4:%.*]] = add nsw i32 [[TMP24]], [[TMP25]]
+// CHECK2-NEXT: store i32 [[ADD4]], i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK2-NEXT: [[TMP26:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK2-NEXT: [[TMP27:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK2-NEXT: [[ADD5:%.*]] = add nsw i32 [[TMP26]], [[TMP27]]
+// CHECK2-NEXT: store i32 [[ADD5]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK2-NEXT: [[TMP28:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK2-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[TMP28]], 99
+// CHECK2-NEXT: br i1 [[CMP6]], label [[COND_TRUE7:%.*]], label [[COND_FALSE8:%.*]]
+// CHECK2: cond.true7:
+// CHECK2-NEXT: br label [[COND_END9:%.*]]
+// CHECK2: cond.false8:
+// CHECK2-NEXT: [[TMP29:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK2-NEXT: br label [[COND_END9]]
+// CHECK2: cond.end9:
+// CHECK2-NEXT: [[COND10:%.*]] = phi i32 [ 99, [[COND_TRUE7]] ], [ [[TMP29]], [[COND_FALSE8]] ]
+// CHECK2-NEXT: store i32 [[COND10]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK2-NEXT: [[TMP30:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK2-NEXT: store i32 [[TMP30]], i32* [[DOTOMP_IV]], align 4
+// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK2: omp.inner.for.end:
+// CHECK2-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK2: omp.loop.exit:
+// CHECK2-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]])
+// CHECK2-NEXT: ret void
+//
+//
+// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__7
+// CHECK2-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i64 [[F:%.*]]) #[[ATTR0]] {
+// CHECK2-NEXT: entry:
+// CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK2-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8
+// CHECK2-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8
+// CHECK2-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 8
+// CHECK2-NEXT: [[F_ADDR:%.*]] = alloca i64, align 8
+// CHECK2-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[_TMP1:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[K:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[J:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK2-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
+// CHECK2-NEXT: store i64 [[DOTPREVIOUS_LB_]], i64* [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK2-NEXT: store i64 [[DOTPREVIOUS_UB_]], i64* [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK2-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 8
+// CHECK2-NEXT: store i64 [[F]], i64* [[F_ADDR]], align 8
+// CHECK2-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 8
+// CHECK2-NEXT: [[CONV:%.*]] = bitcast i64* [[F_ADDR]] to i32*
+// CHECK2-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK2-NEXT: store i32 99, i32* [[DOTOMP_UB]], align 4
+// CHECK2-NEXT: [[TMP1:%.*]] = load i64, i64* [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK2-NEXT: [[CONV2:%.*]] = trunc i64 [[TMP1]] to i32
+// CHECK2-NEXT: [[TMP2:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK2-NEXT: [[CONV3:%.*]] = trunc i64 [[TMP2]] to i32
+// CHECK2-NEXT: store i32 [[CONV2]], i32* [[DOTOMP_LB]], align 4
+// CHECK2-NEXT: store i32 [[CONV3]], i32* [[DOTOMP_UB]], align 4
+// CHECK2-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK2-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK2-NEXT: [[TMP3:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK2-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4
+// CHECK2-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP4]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK2-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK2-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4
+// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK2: omp.inner.for.cond:
+// CHECK2-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK2-NEXT: [[CONV4:%.*]] = sext i32 [[TMP6]] to i64
+// CHECK2-NEXT: [[TMP7:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK2-NEXT: [[CMP:%.*]] = icmp ule i64 [[CONV4]], [[TMP7]]
+// CHECK2-NEXT: br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK2: omp.inner.for.body:
+// CHECK2-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK2-NEXT: [[DIV:%.*]] = sdiv i32 [[TMP8]], 10
+// CHECK2-NEXT: [[MUL:%.*]] = mul nsw i32 [[DIV]], 1
+// CHECK2-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK2-NEXT: store i32 [[ADD]], i32* [[I]], align 4
+// CHECK2-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK2-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK2-NEXT: [[DIV5:%.*]] = sdiv i32 [[TMP10]], 10
+// CHECK2-NEXT: [[MUL6:%.*]] = mul nsw i32 [[DIV5]], 10
+// CHECK2-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP9]], [[MUL6]]
+// CHECK2-NEXT: [[MUL7:%.*]] = mul nsw i32 [[SUB]], 1
+// CHECK2-NEXT: [[ADD8:%.*]] = add nsw i32 0, [[MUL7]]
+// CHECK2-NEXT: store i32 [[ADD8]], i32* [[J]], align 4
+// CHECK2-NEXT: store i32 10, i32* [[K]], align 4
+// CHECK2-NEXT: [[TMP11:%.*]] = load i32, i32* [[I]], align 4
+// CHECK2-NEXT: [[TMP12:%.*]] = load i32, i32* [[J]], align 4
+// CHECK2-NEXT: [[TMP13:%.*]] = load i32, i32* [[CONV]], align 8
+// CHECK2-NEXT: [[MUL9:%.*]] = mul nsw i32 [[TMP12]], [[TMP13]]
+// CHECK2-NEXT: [[ADD10:%.*]] = add nsw i32 [[TMP11]], [[MUL9]]
+// CHECK2-NEXT: [[TMP14:%.*]] = load i32, i32* [[K]], align 4
+// CHECK2-NEXT: [[ADD11:%.*]] = add nsw i32 [[ADD10]], [[TMP14]]
+// CHECK2-NEXT: [[TMP15:%.*]] = load i32, i32* [[I]], align 4
+// CHECK2-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP15]] to i64
+// CHECK2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* [[TMP0]], i64 0, i64 [[IDXPROM]]
+// CHECK2-NEXT: [[TMP16:%.*]] = load i32, i32* [[J]], align 4
+// CHECK2-NEXT: [[IDXPROM12:%.*]] = sext i32 [[TMP16]] to i64
+// CHECK2-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[ARRAYIDX]], i64 0, i64 [[IDXPROM12]]
+// CHECK2-NEXT: store i32 [[ADD11]], i32* [[ARRAYIDX13]], align 4
+// CHECK2-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK2: omp.body.continue:
+// CHECK2-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK2: omp.inner.for.inc:
+// CHECK2-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK2-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK2-NEXT: [[ADD14:%.*]] = add nsw i32 [[TMP17]], [[TMP18]]
+// CHECK2-NEXT: store i32 [[ADD14]], i32* [[DOTOMP_IV]], align 4
+// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK2: omp.inner.for.end:
+// CHECK2-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK2: omp.loop.exit:
+// CHECK2-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP4]])
+// CHECK2-NEXT: ret void
+//
+//
+// CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l67
+// CHECK2-SAME: (i64 [[N:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]]) #[[ATTR0]] {
+// CHECK2-NEXT: entry:
+// CHECK2-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8
+// CHECK2-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 8
+// CHECK2-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8
+// CHECK2-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK2-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8
+// CHECK2-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 8
+// CHECK2-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32*
+// CHECK2-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 8
+// CHECK2-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK2-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0)
+// CHECK2-NEXT: br label [[DOTEXECUTE:%.*]]
+// CHECK2: .execute:
+// CHECK2-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]])
+// CHECK2-NEXT: [[TMP2:%.*]] = load i32, i32* [[CONV]], align 8
+// CHECK2-NEXT: [[CONV1:%.*]] = bitcast i64* [[N_CASTED]] to i32*
+// CHECK2-NEXT: store i32 [[TMP2]], i32* [[CONV1]], align 4
+// CHECK2-NEXT: [[TMP3:%.*]] = load i64, i64* [[N_CASTED]], align 8
+// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK2-NEXT: call void @__omp_outlined__8(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i64 [[TMP3]], [10 x [10 x i32]]* [[TMP0]]) #[[ATTR2]]
+// CHECK2-NEXT: br label [[DOTOMP_DEINIT:%.*]]
+// CHECK2: .omp.deinit:
+// CHECK2-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0)
+// CHECK2-NEXT: br label [[DOTEXIT:%.*]]
+// CHECK2: .exit:
+// CHECK2-NEXT: ret void
+//
+//
+// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__8
+// CHECK2-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[N:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]]) #[[ATTR0]] {
+// CHECK2-NEXT: entry:
+// CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK2-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8
+// CHECK2-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 8
+// CHECK2-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[_TMP1:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTCAPTURE_EXPR_3:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[J:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[I8:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[J9:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8
+// CHECK2-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x i8*], align 8
+// CHECK2-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK2-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
+// CHECK2-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8
+// CHECK2-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 8
+// CHECK2-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32*
+// CHECK2-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 8
+// CHECK2-NEXT: [[TMP1:%.*]] = load i32, i32* [[CONV]], align 8
+// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK2-NEXT: [[TMP2:%.*]] = load i32, i32* [[CONV]], align 8
+// CHECK2-NEXT: store i32 [[TMP2]], i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK2-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK2-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP3]], 0
+// CHECK2-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK2-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK2-NEXT: [[SUB4:%.*]] = sub nsw i32 [[TMP4]], 0
+// CHECK2-NEXT: [[DIV5:%.*]] = sdiv i32 [[SUB4]], 1
+// CHECK2-NEXT: [[MUL:%.*]] = mul nsw i32 [[DIV]], [[DIV5]]
+// CHECK2-NEXT: [[SUB6:%.*]] = sub nsw i32 [[MUL]], 1
+// CHECK2-NEXT: store i32 [[SUB6]], i32* [[DOTCAPTURE_EXPR_3]], align 4
+// CHECK2-NEXT: store i32 0, i32* [[I]], align 4
+// CHECK2-NEXT: store i32 0, i32* [[J]], align 4
+// CHECK2-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK2-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP5]]
+// CHECK2-NEXT: br i1 [[CMP]], label [[LAND_LHS_TRUE:%.*]], label [[OMP_PRECOND_END:%.*]]
+// CHECK2: land.lhs.true:
+// CHECK2-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK2-NEXT: [[CMP7:%.*]] = icmp slt i32 0, [[TMP6]]
+// CHECK2-NEXT: br i1 [[CMP7]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END]]
+// CHECK2: omp.precond.then:
+// CHECK2-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK2-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4
+// CHECK2-NEXT: store i32 [[TMP7]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK2-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK2-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK2-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK2-NEXT: [[TMP8:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK2-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP8]], align 4
+// CHECK2-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP9]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]])
+// CHECK2-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK2-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4
+// CHECK2-NEXT: [[CMP10:%.*]] = icmp sgt i32 [[TMP10]], [[TMP11]]
+// CHECK2-NEXT: br i1 [[CMP10]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK2: cond.true:
+// CHECK2-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4
+// CHECK2-NEXT: br label [[COND_END:%.*]]
+// CHECK2: cond.false:
+// CHECK2-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK2-NEXT: br label [[COND_END]]
+// CHECK2: cond.end:
+// CHECK2-NEXT: [[COND:%.*]] = phi i32 [ [[TMP12]], [[COND_TRUE]] ], [ [[TMP13]], [[COND_FALSE]] ]
+// CHECK2-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK2-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK2-NEXT: store i32 [[TMP14]], i32* [[DOTOMP_IV]], align 4
+// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK2: omp.inner.for.cond:
+// CHECK2-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK2-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4
+// CHECK2-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP16]], 1
+// CHECK2-NEXT: [[CMP11:%.*]] = icmp slt i32 [[TMP15]], [[ADD]]
+// CHECK2-NEXT: br i1 [[CMP11]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK2: omp.inner.for.body:
+// CHECK2-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK2-NEXT: [[TMP18:%.*]] = zext i32 [[TMP17]] to i64
+// CHECK2-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK2-NEXT: [[TMP20:%.*]] = zext i32 [[TMP19]] to i64
+// CHECK2-NEXT: [[TMP21:%.*]] = load i32, i32* [[CONV]], align 8
+// CHECK2-NEXT: [[CONV12:%.*]] = bitcast i64* [[N_CASTED]] to i32*
+// CHECK2-NEXT: store i32 [[TMP21]], i32* [[CONV12]], align 4
+// CHECK2-NEXT: [[TMP22:%.*]] = load i64, i64* [[N_CASTED]], align 8
+// CHECK2-NEXT: [[TMP23:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0
+// CHECK2-NEXT: [[TMP24:%.*]] = inttoptr i64 [[TMP18]] to i8*
+// CHECK2-NEXT: store i8* [[TMP24]], i8** [[TMP23]], align 8
+// CHECK2-NEXT: [[TMP25:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1
+// CHECK2-NEXT: [[TMP26:%.*]] = inttoptr i64 [[TMP20]] to i8*
+// CHECK2-NEXT: store i8* [[TMP26]], i8** [[TMP25]], align 8
+// CHECK2-NEXT: [[TMP27:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 2
+// CHECK2-NEXT: [[TMP28:%.*]] = inttoptr i64 [[TMP22]] to i8*
+// CHECK2-NEXT: store i8* [[TMP28]], i8** [[TMP27]], align 8
+// CHECK2-NEXT: [[TMP29:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 3
+// CHECK2-NEXT: [[TMP30:%.*]] = bitcast [10 x [10 x i32]]* [[TMP0]] to i8*
+// CHECK2-NEXT: store i8* [[TMP30]], i8** [[TMP29]], align 8
+// CHECK2-NEXT: [[TMP31:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK2-NEXT: [[TMP32:%.*]] = load i32, i32* [[TMP31]], align 4
+// CHECK2-NEXT: [[TMP33:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK2-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP32]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i64, i64, i64, [10 x [10 x i32]]*)* @__omp_outlined__9 to i8*), i8* null, i8** [[TMP33]], i64 4)
+// CHECK2-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK2: omp.inner.for.inc:
+// CHECK2-NEXT: [[TMP34:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK2-NEXT: [[TMP35:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK2-NEXT: [[ADD13:%.*]] = add nsw i32 [[TMP34]], [[TMP35]]
+// CHECK2-NEXT: store i32 [[ADD13]], i32* [[DOTOMP_IV]], align 4
+// CHECK2-NEXT: [[TMP36:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK2-NEXT: [[TMP37:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK2-NEXT: [[ADD14:%.*]] = add nsw i32 [[TMP36]], [[TMP37]]
+// CHECK2-NEXT: store i32 [[ADD14]], i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK2-NEXT: [[TMP38:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK2-NEXT: [[TMP39:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK2-NEXT: [[ADD15:%.*]] = add nsw i32 [[TMP38]], [[TMP39]]
+// CHECK2-NEXT: store i32 [[ADD15]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK2-NEXT: [[TMP40:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK2-NEXT: [[TMP41:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4
+// CHECK2-NEXT: [[CMP16:%.*]] = icmp sgt i32 [[TMP40]], [[TMP41]]
+// CHECK2-NEXT: br i1 [[CMP16]], label [[COND_TRUE17:%.*]], label [[COND_FALSE18:%.*]]
+// CHECK2: cond.true17:
+// CHECK2-NEXT: [[TMP42:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4
+// CHECK2-NEXT: br label [[COND_END19:%.*]]
+// CHECK2: cond.false18:
+// CHECK2-NEXT: [[TMP43:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK2-NEXT: br label [[COND_END19]]
+// CHECK2: cond.end19:
+// CHECK2-NEXT: [[COND20:%.*]] = phi i32 [ [[TMP42]], [[COND_TRUE17]] ], [ [[TMP43]], [[COND_FALSE18]] ]
+// CHECK2-NEXT: store i32 [[COND20]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK2-NEXT: [[TMP44:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK2-NEXT: store i32 [[TMP44]], i32* [[DOTOMP_IV]], align 4
+// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK2: omp.inner.for.end:
+// CHECK2-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK2: omp.loop.exit:
+// CHECK2-NEXT: [[TMP45:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK2-NEXT: [[TMP46:%.*]] = load i32, i32* [[TMP45]], align 4
+// CHECK2-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP46]])
+// CHECK2-NEXT: br label [[OMP_PRECOND_END]]
+// CHECK2: omp.precond.end:
+// CHECK2-NEXT: ret void
+//
+//
+// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__9
+// CHECK2-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], i64 [[N:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]]) #[[ATTR0]] {
+// CHECK2-NEXT: entry:
+// CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK2-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8
+// CHECK2-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8
+// CHECK2-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8
+// CHECK2-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 8
+// CHECK2-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[_TMP1:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTCAPTURE_EXPR_3:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[J:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[I10:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[J11:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK2-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
+// CHECK2-NEXT: store i64 [[DOTPREVIOUS_LB_]], i64* [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK2-NEXT: store i64 [[DOTPREVIOUS_UB_]], i64* [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK2-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8
+// CHECK2-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 8
+// CHECK2-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32*
+// CHECK2-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 8
+// CHECK2-NEXT: [[TMP1:%.*]] = load i32, i32* [[CONV]], align 8
+// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK2-NEXT: [[TMP2:%.*]] = load i32, i32* [[CONV]], align 8
+// CHECK2-NEXT: store i32 [[TMP2]], i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK2-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK2-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP3]], 0
+// CHECK2-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK2-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK2-NEXT: [[SUB4:%.*]] = sub nsw i32 [[TMP4]], 0
+// CHECK2-NEXT: [[DIV5:%.*]] = sdiv i32 [[SUB4]], 1
+// CHECK2-NEXT: [[MUL:%.*]] = mul nsw i32 [[DIV]], [[DIV5]]
+// CHECK2-NEXT: [[SUB6:%.*]] = sub nsw i32 [[MUL]], 1
+// CHECK2-NEXT: store i32 [[SUB6]], i32* [[DOTCAPTURE_EXPR_3]], align 4
+// CHECK2-NEXT: store i32 0, i32* [[I]], align 4
+// CHECK2-NEXT: store i32 0, i32* [[J]], align 4
+// CHECK2-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK2-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP5]]
+// CHECK2-NEXT: br i1 [[CMP]], label [[LAND_LHS_TRUE:%.*]], label [[OMP_PRECOND_END:%.*]]
+// CHECK2: land.lhs.true:
+// CHECK2-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK2-NEXT: [[CMP7:%.*]] = icmp slt i32 0, [[TMP6]]
+// CHECK2-NEXT: br i1 [[CMP7]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END]]
+// CHECK2: omp.precond.then:
+// CHECK2-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK2-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4
+// CHECK2-NEXT: store i32 [[TMP7]], i32* [[DOTOMP_UB]], align 4
+// CHECK2-NEXT: [[TMP8:%.*]] = load i64, i64* [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK2-NEXT: [[CONV8:%.*]] = trunc i64 [[TMP8]] to i32
+// CHECK2-NEXT: [[TMP9:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK2-NEXT: [[CONV9:%.*]] = trunc i64 [[TMP9]] to i32
+// CHECK2-NEXT: store i32 [[CONV8]], i32* [[DOTOMP_LB]], align 4
+// CHECK2-NEXT: store i32 [[CONV9]], i32* [[DOTOMP_UB]], align 4
+// CHECK2-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK2-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK2-NEXT: [[TMP10:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK2-NEXT: [[TMP11:%.*]] = load i32, i32* [[TMP10]], align 4
+// CHECK2-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP11]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK2-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK2-NEXT: store i32 [[TMP12]], i32* [[DOTOMP_IV]], align 4
+// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK2: omp.inner.for.cond:
+// CHECK2-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK2-NEXT: [[CONV12:%.*]] = sext i32 [[TMP13]] to i64
+// CHECK2-NEXT: [[TMP14:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK2-NEXT: [[CMP13:%.*]] = icmp ule i64 [[CONV12]], [[TMP14]]
+// CHECK2-NEXT: br i1 [[CMP13]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK2: omp.inner.for.body:
+// CHECK2-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK2-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK2-NEXT: [[SUB14:%.*]] = sub nsw i32 [[TMP16]], 0
+// CHECK2-NEXT: [[DIV15:%.*]] = sdiv i32 [[SUB14]], 1
+// CHECK2-NEXT: [[MUL16:%.*]] = mul nsw i32 1, [[DIV15]]
+// CHECK2-NEXT: [[DIV17:%.*]] = sdiv i32 [[TMP15]], [[MUL16]]
+// CHECK2-NEXT: [[MUL18:%.*]] = mul nsw i32 [[DIV17]], 1
+// CHECK2-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL18]]
+// CHECK2-NEXT: store i32 [[ADD]], i32* [[I10]], align 4
+// CHECK2-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK2-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK2-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK2-NEXT: [[SUB19:%.*]] = sub nsw i32 [[TMP19]], 0
+// CHECK2-NEXT: [[DIV20:%.*]] = sdiv i32 [[SUB19]], 1
+// CHECK2-NEXT: [[MUL21:%.*]] = mul nsw i32 1, [[DIV20]]
+// CHECK2-NEXT: [[DIV22:%.*]] = sdiv i32 [[TMP18]], [[MUL21]]
+// CHECK2-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK2-NEXT: [[SUB23:%.*]] = sub nsw i32 [[TMP20]], 0
+// CHECK2-NEXT: [[DIV24:%.*]] = sdiv i32 [[SUB23]], 1
+// CHECK2-NEXT: [[MUL25:%.*]] = mul nsw i32 1, [[DIV24]]
+// CHECK2-NEXT: [[MUL26:%.*]] = mul nsw i32 [[DIV22]], [[MUL25]]
+// CHECK2-NEXT: [[SUB27:%.*]] = sub nsw i32 [[TMP17]], [[MUL26]]
+// CHECK2-NEXT: [[MUL28:%.*]] = mul nsw i32 [[SUB27]], 1
+// CHECK2-NEXT: [[ADD29:%.*]] = add nsw i32 0, [[MUL28]]
+// CHECK2-NEXT: store i32 [[ADD29]], i32* [[J11]], align 4
+// CHECK2-NEXT: [[TMP21:%.*]] = load i32, i32* [[I10]], align 4
+// CHECK2-NEXT: [[TMP22:%.*]] = load i32, i32* [[J11]], align 4
+// CHECK2-NEXT: [[ADD30:%.*]] = add nsw i32 [[TMP21]], [[TMP22]]
+// CHECK2-NEXT: [[TMP23:%.*]] = load i32, i32* [[I10]], align 4
+// CHECK2-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP23]] to i64
+// CHECK2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* [[TMP0]], i64 0, i64 [[IDXPROM]]
+// CHECK2-NEXT: [[TMP24:%.*]] = load i32, i32* [[J11]], align 4
+// CHECK2-NEXT: [[IDXPROM31:%.*]] = sext i32 [[TMP24]] to i64
+// CHECK2-NEXT: [[ARRAYIDX32:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[ARRAYIDX]], i64 0, i64 [[IDXPROM31]]
+// CHECK2-NEXT: store i32 [[ADD30]], i32* [[ARRAYIDX32]], align 4
+// CHECK2-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK2: omp.body.continue:
+// CHECK2-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK2: omp.inner.for.inc:
+// CHECK2-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK2-NEXT: [[TMP26:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK2-NEXT: [[ADD33:%.*]] = add nsw i32 [[TMP25]], [[TMP26]]
+// CHECK2-NEXT: store i32 [[ADD33]], i32* [[DOTOMP_IV]], align 4
+// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK2: omp.inner.for.end:
+// CHECK2-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK2: omp.loop.exit:
+// CHECK2-NEXT: [[TMP27:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK2-NEXT: [[TMP28:%.*]] = load i32, i32* [[TMP27]], align 4
+// CHECK2-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP28]])
+// CHECK2-NEXT: br label [[OMP_PRECOND_END]]
+// CHECK2: omp.precond.end:
+// CHECK2-NEXT: ret void
+//
+//
+// CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l74
+// CHECK2-SAME: (i64 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32* [[V:%.*]]) #[[ATTR0]] {
+// CHECK2-NEXT: entry:
+// CHECK2-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8
+// CHECK2-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 8
+// CHECK2-NEXT: [[V_ADDR:%.*]] = alloca i32*, align 8
+// CHECK2-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8
+// CHECK2-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK2-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8
+// CHECK2-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 8
+// CHECK2-NEXT: store i32* [[V]], i32** [[V_ADDR]], align 8
+// CHECK2-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32*
+// CHECK2-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 8
+// CHECK2-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK2-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0)
+// CHECK2-NEXT: br label [[DOTEXECUTE:%.*]]
+// CHECK2: .execute:
+// CHECK2-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]])
+// CHECK2-NEXT: [[TMP2:%.*]] = load i32, i32* [[CONV]], align 8
+// CHECK2-NEXT: [[CONV1:%.*]] = bitcast i64* [[N_CASTED]] to i32*
+// CHECK2-NEXT: store i32 [[TMP2]], i32* [[CONV1]], align 4
+// CHECK2-NEXT: [[TMP3:%.*]] = load i64, i64* [[N_CASTED]], align 8
+// CHECK2-NEXT: [[TMP4:%.*]] = load i32*, i32** [[V_ADDR]], align 8
+// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK2-NEXT: call void @__omp_outlined__10(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i64 [[TMP3]], [1000 x i32]* [[TMP0]], i32* [[TMP4]]) #[[ATTR2]]
+// CHECK2-NEXT: br label [[DOTOMP_DEINIT:%.*]]
+// CHECK2: .omp.deinit:
+// CHECK2-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0)
+// CHECK2-NEXT: br label [[DOTEXIT:%.*]]
+// CHECK2: .exit:
+// CHECK2-NEXT: ret void
+//
+//
+// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__10
+// CHECK2-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32* [[V:%.*]]) #[[ATTR0]] {
+// CHECK2-NEXT: entry:
+// CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK2-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8
+// CHECK2-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 8
+// CHECK2-NEXT: [[V_ADDR:%.*]] = alloca i32*, align 8
+// CHECK2-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[I3:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8
+// CHECK2-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [5 x i8*], align 8
+// CHECK2-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK2-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
+// CHECK2-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8
+// CHECK2-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 8
+// CHECK2-NEXT: store i32* [[V]], i32** [[V_ADDR]], align 8
+// CHECK2-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32*
+// CHECK2-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 8
+// CHECK2-NEXT: [[TMP1:%.*]] = load i32, i32* [[CONV]], align 8
+// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK2-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK2-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0
+// CHECK2-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK2-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1
+// CHECK2-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK2-NEXT: store i32 0, i32* [[I]], align 4
+// CHECK2-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK2-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]]
+// CHECK2-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]]
+// CHECK2: omp.precond.then:
+// CHECK2-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK2-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK2-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK2-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK2-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK2-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK2-NEXT: [[TMP5:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK2-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4
+// CHECK2-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP6]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]])
+// CHECK2-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK2-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK2-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP7]], [[TMP8]]
+// CHECK2-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK2: cond.true:
+// CHECK2-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK2-NEXT: br label [[COND_END:%.*]]
+// CHECK2: cond.false:
+// CHECK2-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK2-NEXT: br label [[COND_END]]
+// CHECK2: cond.end:
+// CHECK2-NEXT: [[COND:%.*]] = phi i32 [ [[TMP9]], [[COND_TRUE]] ], [ [[TMP10]], [[COND_FALSE]] ]
+// CHECK2-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK2-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK2-NEXT: store i32 [[TMP11]], i32* [[DOTOMP_IV]], align 4
+// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK2: omp.inner.for.cond:
+// CHECK2-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK2-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK2-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP13]], 1
+// CHECK2-NEXT: [[CMP5:%.*]] = icmp slt i32 [[TMP12]], [[ADD]]
+// CHECK2-NEXT: br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK2: omp.inner.for.body:
+// CHECK2-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK2-NEXT: [[TMP15:%.*]] = zext i32 [[TMP14]] to i64
+// CHECK2-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK2-NEXT: [[TMP17:%.*]] = zext i32 [[TMP16]] to i64
+// CHECK2-NEXT: [[TMP18:%.*]] = load i32, i32* [[CONV]], align 8
+// CHECK2-NEXT: [[CONV6:%.*]] = bitcast i64* [[N_CASTED]] to i32*
+// CHECK2-NEXT: store i32 [[TMP18]], i32* [[CONV6]], align 4
+// CHECK2-NEXT: [[TMP19:%.*]] = load i64, i64* [[N_CASTED]], align 8
+// CHECK2-NEXT: [[TMP20:%.*]] = load i32*, i32** [[V_ADDR]], align 8
+// CHECK2-NEXT: [[TMP21:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0
+// CHECK2-NEXT: [[TMP22:%.*]] = inttoptr i64 [[TMP15]] to i8*
+// CHECK2-NEXT: store i8* [[TMP22]], i8** [[TMP21]], align 8
+// CHECK2-NEXT: [[TMP23:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1
+// CHECK2-NEXT: [[TMP24:%.*]] = inttoptr i64 [[TMP17]] to i8*
+// CHECK2-NEXT: store i8* [[TMP24]], i8** [[TMP23]], align 8
+// CHECK2-NEXT: [[TMP25:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 2
+// CHECK2-NEXT: [[TMP26:%.*]] = inttoptr i64 [[TMP19]] to i8*
+// CHECK2-NEXT: store i8* [[TMP26]], i8** [[TMP25]], align 8
+// CHECK2-NEXT: [[TMP27:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 3
+// CHECK2-NEXT: [[TMP28:%.*]] = bitcast [1000 x i32]* [[TMP0]] to i8*
+// CHECK2-NEXT: store i8* [[TMP28]], i8** [[TMP27]], align 8
+// CHECK2-NEXT: [[TMP29:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 4
+// CHECK2-NEXT: [[TMP30:%.*]] = bitcast i32* [[TMP20]] to i8*
+// CHECK2-NEXT: store i8* [[TMP30]], i8** [[TMP29]], align 8
+// CHECK2-NEXT: [[TMP31:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK2-NEXT: [[TMP32:%.*]] = load i32, i32* [[TMP31]], align 4
+// CHECK2-NEXT: [[TMP33:%.*]] = bitcast [5 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK2-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP32]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i64, i64, i64, [1000 x i32]*, i32*)* @__omp_outlined__11 to i8*), i8* null, i8** [[TMP33]], i64 5)
+// CHECK2-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK2: omp.inner.for.inc:
+// CHECK2-NEXT: [[TMP34:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK2-NEXT: [[TMP35:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK2-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP34]], [[TMP35]]
+// CHECK2-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_IV]], align 4
+// CHECK2-NEXT: [[TMP36:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK2-NEXT: [[TMP37:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK2-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP36]], [[TMP37]]
+// CHECK2-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK2-NEXT: [[TMP38:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK2-NEXT: [[TMP39:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK2-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP38]], [[TMP39]]
+// CHECK2-NEXT: store i32 [[ADD9]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK2-NEXT: [[TMP40:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK2-NEXT: [[TMP41:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK2-NEXT: [[CMP10:%.*]] = icmp sgt i32 [[TMP40]], [[TMP41]]
+// CHECK2-NEXT: br i1 [[CMP10]], label [[COND_TRUE11:%.*]], label [[COND_FALSE12:%.*]]
+// CHECK2: cond.true11:
+// CHECK2-NEXT: [[TMP42:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK2-NEXT: br label [[COND_END13:%.*]]
+// CHECK2: cond.false12:
+// CHECK2-NEXT: [[TMP43:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK2-NEXT: br label [[COND_END13]]
+// CHECK2: cond.end13:
+// CHECK2-NEXT: [[COND14:%.*]] = phi i32 [ [[TMP42]], [[COND_TRUE11]] ], [ [[TMP43]], [[COND_FALSE12]] ]
+// CHECK2-NEXT: store i32 [[COND14]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK2-NEXT: [[TMP44:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK2-NEXT: store i32 [[TMP44]], i32* [[DOTOMP_IV]], align 4
+// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK2: omp.inner.for.end:
+// CHECK2-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK2: omp.loop.exit:
+// CHECK2-NEXT: [[TMP45:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK2-NEXT: [[TMP46:%.*]] = load i32, i32* [[TMP45]], align 4
+// CHECK2-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP46]])
+// CHECK2-NEXT: br label [[OMP_PRECOND_END]]
+// CHECK2: omp.precond.end:
+// CHECK2-NEXT: ret void
+//
+//
+// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__11
+// CHECK2-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], i64 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32* [[V:%.*]]) #[[ATTR0]] {
+// CHECK2-NEXT: entry:
+// CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK2-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8
+// CHECK2-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8
+// CHECK2-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8
+// CHECK2-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 8
+// CHECK2-NEXT: [[V_ADDR:%.*]] = alloca i32*, align 8
+// CHECK2-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[I5:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK2-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
+// CHECK2-NEXT: store i64 [[DOTPREVIOUS_LB_]], i64* [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK2-NEXT: store i64 [[DOTPREVIOUS_UB_]], i64* [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK2-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8
+// CHECK2-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 8
+// CHECK2-NEXT: store i32* [[V]], i32** [[V_ADDR]], align 8
+// CHECK2-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32*
+// CHECK2-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 8
+// CHECK2-NEXT: [[TMP1:%.*]] = load i32, i32* [[CONV]], align 8
+// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK2-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK2-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0
+// CHECK2-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK2-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1
+// CHECK2-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK2-NEXT: store i32 0, i32* [[I]], align 4
+// CHECK2-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK2-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]]
+// CHECK2-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]]
+// CHECK2: omp.precond.then:
+// CHECK2-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK2-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK2-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_UB]], align 4
+// CHECK2-NEXT: [[TMP5:%.*]] = load i64, i64* [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK2-NEXT: [[CONV3:%.*]] = trunc i64 [[TMP5]] to i32
+// CHECK2-NEXT: [[TMP6:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK2-NEXT: [[CONV4:%.*]] = trunc i64 [[TMP6]] to i32
+// CHECK2-NEXT: store i32 [[CONV3]], i32* [[DOTOMP_LB]], align 4
+// CHECK2-NEXT: store i32 [[CONV4]], i32* [[DOTOMP_UB]], align 4
+// CHECK2-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK2-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK2-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK2-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4
+// CHECK2-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP8]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK2-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK2-NEXT: store i32 [[TMP9]], i32* [[DOTOMP_IV]], align 4
+// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK2: omp.inner.for.cond:
+// CHECK2-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK2-NEXT: [[CONV6:%.*]] = sext i32 [[TMP10]] to i64
+// CHECK2-NEXT: [[TMP11:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK2-NEXT: [[CMP7:%.*]] = icmp ule i64 [[CONV6]], [[TMP11]]
+// CHECK2-NEXT: br i1 [[CMP7]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK2: omp.inner.for.body:
+// CHECK2-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK2-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP12]], 1
+// CHECK2-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK2-NEXT: store i32 [[ADD]], i32* [[I5]], align 4
+// CHECK2-NEXT: [[TMP13:%.*]] = load i32*, i32** [[V_ADDR]], align 8
+// CHECK2-NEXT: [[TMP14:%.*]] = load i32, i32* [[I5]], align 4
+// CHECK2-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP14]] to i64
+// CHECK2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TMP13]], i64 [[IDXPROM]]
+// CHECK2-NEXT: [[TMP15:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+// CHECK2-NEXT: [[TMP16:%.*]] = load i32, i32* [[I5]], align 4
+// CHECK2-NEXT: [[IDXPROM8:%.*]] = sext i32 [[TMP16]] to i64
+// CHECK2-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds [1000 x i32], [1000 x i32]* [[TMP0]], i64 0, i64 [[IDXPROM8]]
+// CHECK2-NEXT: store i32 [[TMP15]], i32* [[ARRAYIDX9]], align 4
+// CHECK2-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK2: omp.body.continue:
+// CHECK2-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK2: omp.inner.for.inc:
+// CHECK2-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK2-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK2-NEXT: [[ADD10:%.*]] = add nsw i32 [[TMP17]], [[TMP18]]
+// CHECK2-NEXT: store i32 [[ADD10]], i32* [[DOTOMP_IV]], align 4
+// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK2: omp.inner.for.end:
+// CHECK2-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK2: omp.loop.exit:
+// CHECK2-NEXT: [[TMP19:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK2-NEXT: [[TMP20:%.*]] = load i32, i32* [[TMP19]], align 4
+// CHECK2-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP20]])
+// CHECK2-NEXT: br label [[OMP_PRECOND_END]]
+// CHECK2: omp.precond.end:
+// CHECK2-NEXT: ret void
+//
+//
+// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l43
+// CHECK3-SAME: (i64 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i64 [[L:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK3-NEXT: entry:
+// CHECK3-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8
+// CHECK3-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 8
+// CHECK3-NEXT: [[L_ADDR:%.*]] = alloca i64, align 8
+// CHECK3-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8
+// CHECK3-NEXT: [[L_CASTED:%.*]] = alloca i64, align 8
+// CHECK3-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK3-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8
+// CHECK3-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 8
+// CHECK3-NEXT: store i64 [[L]], i64* [[L_ADDR]], align 8
+// CHECK3-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32*
+// CHECK3-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 8
+// CHECK3-NEXT: [[CONV1:%.*]] = bitcast i64* [[L_ADDR]] to i32*
+// CHECK3-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK3-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0)
+// CHECK3-NEXT: br label [[DOTEXECUTE:%.*]]
+// CHECK3: .execute:
+// CHECK3-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3:[0-9]+]])
+// CHECK3-NEXT: [[TMP2:%.*]] = load i32, i32* [[CONV]], align 8
+// CHECK3-NEXT: [[CONV2:%.*]] = bitcast i64* [[N_CASTED]] to i32*
+// CHECK3-NEXT: store i32 [[TMP2]], i32* [[CONV2]], align 4
+// CHECK3-NEXT: [[TMP3:%.*]] = load i64, i64* [[N_CASTED]], align 8
+// CHECK3-NEXT: [[TMP4:%.*]] = load i32, i32* [[CONV1]], align 8
+// CHECK3-NEXT: [[CONV3:%.*]] = bitcast i64* [[L_CASTED]] to i32*
+// CHECK3-NEXT: store i32 [[TMP4]], i32* [[CONV3]], align 4
+// CHECK3-NEXT: [[TMP5:%.*]] = load i64, i64* [[L_CASTED]], align 8
+// CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK3-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i64 [[TMP3]], [1000 x i32]* [[TMP0]], i64 [[TMP5]]) #[[ATTR2:[0-9]+]]
+// CHECK3-NEXT: br label [[DOTOMP_DEINIT:%.*]]
+// CHECK3: .omp.deinit:
+// CHECK3-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0)
+// CHECK3-NEXT: br label [[DOTEXIT:%.*]]
+// CHECK3: .exit:
+// CHECK3-NEXT: ret void
+//
+//
+// CHECK3-LABEL: define {{[^@]+}}@__omp_outlined__
+// CHECK3-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i64 [[L:%.*]]) #[[ATTR0]] {
+// CHECK3-NEXT: entry:
+// CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK3-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8
+// CHECK3-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 8
+// CHECK3-NEXT: [[L_ADDR:%.*]] = alloca i64, align 8
+// CHECK3-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTCAPTURE_EXPR_3:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[I5:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8
+// CHECK3-NEXT: [[L_CASTED:%.*]] = alloca i64, align 8
+// CHECK3-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [5 x i8*], align 8
+// CHECK3-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK3-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
+// CHECK3-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8
+// CHECK3-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 8
+// CHECK3-NEXT: store i64 [[L]], i64* [[L_ADDR]], align 8
+// CHECK3-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32*
+// CHECK3-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 8
+// CHECK3-NEXT: [[CONV1:%.*]] = bitcast i64* [[L_ADDR]] to i32*
+// CHECK3-NEXT: [[TMP1:%.*]] = call i8* @__kmpc_data_sharing_push_stack(i64 4, i16 1)
+// CHECK3-NEXT: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to %struct._globalized_locals_ty*
+// CHECK3-NEXT: [[L2:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* 
[[TMP2]], i32 0, i32 0 +// CHECK3-NEXT: [[TMP3:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK3-NEXT: store i32 [[TMP3]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK3-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK3-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP4]], 0 +// CHECK3-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK3-NEXT: [[SUB4:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK3-NEXT: store i32 [[SUB4]], i32* [[DOTCAPTURE_EXPR_3]], align 4 +// CHECK3-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK3-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK3-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP5]] +// CHECK3-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK3: omp.precond.then: +// CHECK3-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK3-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4 +// CHECK3-NEXT: store i32 [[TMP6]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK3-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK3-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK3-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK3-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4 +// CHECK3-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1:[0-9]+]], i32 [[TMP8]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 128) +// CHECK3-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK3-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4 +// CHECK3-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[TMP9]], [[TMP10]] +// CHECK3-NEXT: br i1 [[CMP6]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK3: cond.true: +// CHECK3-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4 +// CHECK3-NEXT: br label [[COND_END:%.*]] +// CHECK3: cond.false: +// CHECK3-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK3-NEXT: br label [[COND_END]] +// CHECK3: cond.end: +// CHECK3-NEXT: [[COND:%.*]] = phi i32 [ [[TMP11]], [[COND_TRUE]] ], [ [[TMP12]], [[COND_FALSE]] ] +// CHECK3-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK3-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK3-NEXT: store i32 [[TMP13]], i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK3: omp.inner.for.cond: +// CHECK3-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4 +// CHECK3-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP15]], 1 +// CHECK3-NEXT: [[CMP7:%.*]] = icmp slt i32 [[TMP14]], [[ADD]] +// CHECK3-NEXT: br i1 [[CMP7]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK3: omp.inner.for.body: +// CHECK3-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK3-NEXT: [[TMP17:%.*]] = zext i32 [[TMP16]] to i64 +// CHECK3-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK3-NEXT: [[TMP19:%.*]] = zext i32 [[TMP18]] to i64 +// CHECK3-NEXT: [[TMP20:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK3-NEXT: [[CONV8:%.*]] = bitcast i64* [[N_CASTED]] to i32* +// CHECK3-NEXT: store i32 [[TMP20]], i32* [[CONV8]], align 4 +// CHECK3-NEXT: [[TMP21:%.*]] = load i64, i64* [[N_CASTED]], align 8 +// CHECK3-NEXT: [[TMP22:%.*]] = load i32, i32* [[CONV1]], align 8 +// CHECK3-NEXT: [[CONV9:%.*]] = bitcast i64* [[L_CASTED]] 
to i32* +// CHECK3-NEXT: store i32 [[TMP22]], i32* [[CONV9]], align 4 +// CHECK3-NEXT: [[TMP23:%.*]] = load i64, i64* [[L_CASTED]], align 8 +// CHECK3-NEXT: [[TMP24:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0 +// CHECK3-NEXT: [[TMP25:%.*]] = inttoptr i64 [[TMP17]] to i8* +// CHECK3-NEXT: store i8* [[TMP25]], i8** [[TMP24]], align 8 +// CHECK3-NEXT: [[TMP26:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1 +// CHECK3-NEXT: [[TMP27:%.*]] = inttoptr i64 [[TMP19]] to i8* +// CHECK3-NEXT: store i8* [[TMP27]], i8** [[TMP26]], align 8 +// CHECK3-NEXT: [[TMP28:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 2 +// CHECK3-NEXT: [[TMP29:%.*]] = inttoptr i64 [[TMP21]] to i8* +// CHECK3-NEXT: store i8* [[TMP29]], i8** [[TMP28]], align 8 +// CHECK3-NEXT: [[TMP30:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 3 +// CHECK3-NEXT: [[TMP31:%.*]] = bitcast [1000 x i32]* [[TMP0]] to i8* +// CHECK3-NEXT: store i8* [[TMP31]], i8** [[TMP30]], align 8 +// CHECK3-NEXT: [[TMP32:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 4 +// CHECK3-NEXT: [[TMP33:%.*]] = inttoptr i64 [[TMP23]] to i8* +// CHECK3-NEXT: store i8* [[TMP33]], i8** [[TMP32]], align 8 +// CHECK3-NEXT: [[TMP34:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK3-NEXT: [[TMP35:%.*]] = load i32, i32* [[TMP34]], align 4 +// CHECK3-NEXT: [[TMP36:%.*]] = bitcast [5 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK3-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP35]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i64, i64, i64, [1000 x i32]*, i64)* @__omp_outlined__1 to i8*), i8* null, i8** [[TMP36]], i64 5) +// CHECK3-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK3: omp.inner.for.inc: +// CHECK3-NEXT: [[TMP37:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: [[TMP38:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK3-NEXT: [[ADD10:%.*]] = add nsw i32 [[TMP37]], [[TMP38]] +// CHECK3-NEXT: store i32 [[ADD10]], i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: [[TMP39:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK3-NEXT: [[TMP40:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK3-NEXT: [[ADD11:%.*]] = add nsw i32 [[TMP39]], [[TMP40]] +// CHECK3-NEXT: store i32 [[ADD11]], i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK3-NEXT: [[TMP41:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK3-NEXT: [[TMP42:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK3-NEXT: [[ADD12:%.*]] = add nsw i32 [[TMP41]], [[TMP42]] +// CHECK3-NEXT: store i32 [[ADD12]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK3-NEXT: [[TMP43:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK3-NEXT: [[TMP44:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4 +// CHECK3-NEXT: [[CMP13:%.*]] = icmp sgt i32 [[TMP43]], [[TMP44]] +// CHECK3-NEXT: br i1 [[CMP13]], label [[COND_TRUE14:%.*]], label [[COND_FALSE15:%.*]] +// CHECK3: cond.true14: +// CHECK3-NEXT: [[TMP45:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4 +// CHECK3-NEXT: br label [[COND_END16:%.*]] +// CHECK3: cond.false15: +// CHECK3-NEXT: [[TMP46:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK3-NEXT: br label [[COND_END16]] +// CHECK3: cond.end16: +// CHECK3-NEXT: [[COND17:%.*]] = phi i32 [ [[TMP45]], [[COND_TRUE14]] ], [ [[TMP46]], [[COND_FALSE15]] ] +// CHECK3-NEXT: store i32 [[COND17]], i32* 
[[DOTOMP_COMB_UB]], align 4 +// CHECK3-NEXT: [[TMP47:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK3-NEXT: store i32 [[TMP47]], i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK3: omp.inner.for.end: +// CHECK3-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK3: omp.loop.exit: +// CHECK3-NEXT: [[TMP48:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK3-NEXT: [[TMP49:%.*]] = load i32, i32* [[TMP48]], align 4 +// CHECK3-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP49]]) +// CHECK3-NEXT: [[TMP50:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK3-NEXT: [[TMP51:%.*]] = icmp ne i32 [[TMP50]], 0 +// CHECK3-NEXT: br i1 [[TMP51]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]] +// CHECK3: .omp.lastprivate.then: +// CHECK3-NEXT: [[TMP52:%.*]] = load i32, i32* [[CONV1]], align 8 +// CHECK3-NEXT: store i32 [[TMP52]], i32* [[CONV1]], align 8 +// CHECK3-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]] +// CHECK3: .omp.lastprivate.done: +// CHECK3-NEXT: br label [[OMP_PRECOND_END]] +// CHECK3: omp.precond.end: +// CHECK3-NEXT: call void @__kmpc_data_sharing_pop_stack(i8* [[TMP1]]) +// CHECK3-NEXT: ret void +// +// +// CHECK3-LABEL: define {{[^@]+}}@__omp_outlined__1 +// CHECK3-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], i64 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i64 [[L:%.*]]) #[[ATTR0]] { +// CHECK3-NEXT: entry: +// CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK3-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8 +// CHECK3-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8 +// CHECK3-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 +// CHECK3-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 8 +// CHECK3-NEXT: [[L_ADDR:%.*]] = alloca i64, align 8 +// CHECK3-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[I6:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK3-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +// CHECK3-NEXT: store i64 [[DOTPREVIOUS_LB_]], i64* [[DOTPREVIOUS_LB__ADDR]], align 8 +// CHECK3-NEXT: store i64 [[DOTPREVIOUS_UB_]], i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK3-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 +// CHECK3-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 8 +// CHECK3-NEXT: store i64 [[L]], i64* [[L_ADDR]], align 8 +// CHECK3-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* +// CHECK3-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 8 +// CHECK3-NEXT: [[CONV1:%.*]] = bitcast i64* [[L_ADDR]] to i32* +// CHECK3-NEXT: [[TMP1:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK3-NEXT: [[TMP2:%.*]] = load i32, i32* 
[[DOTCAPTURE_EXPR_]], align 4 +// CHECK3-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 +// CHECK3-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK3-NEXT: [[SUB3:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK3-NEXT: store i32 [[SUB3]], i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK3-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK3-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK3-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] +// CHECK3-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK3: omp.precond.then: +// CHECK3-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 +// CHECK3-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK3-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_UB]], align 4 +// CHECK3-NEXT: [[TMP5:%.*]] = load i64, i64* [[DOTPREVIOUS_LB__ADDR]], align 8 +// CHECK3-NEXT: [[CONV4:%.*]] = trunc i64 [[TMP5]] to i32 +// CHECK3-NEXT: [[TMP6:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK3-NEXT: [[CONV5:%.*]] = trunc i64 [[TMP6]] to i32 +// CHECK3-NEXT: store i32 [[CONV4]], i32* [[DOTOMP_LB]], align 4 +// CHECK3-NEXT: store i32 [[CONV5]], i32* [[DOTOMP_UB]], align 4 +// CHECK3-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK3-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK3-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK3-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4 +// CHECK3-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP8]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 32) +// CHECK3-NEXT: br label [[OMP_DISPATCH_COND:%.*]] +// CHECK3: omp.dispatch.cond: +// CHECK3-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK3-NEXT: [[CONV7:%.*]] = sext i32 [[TMP9]] to i64 +// CHECK3-NEXT: [[TMP10:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK3-NEXT: [[CMP8:%.*]] = icmp ugt i64 [[CONV7]], [[TMP10]] +// CHECK3-NEXT: br i1 [[CMP8]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK3: cond.true: +// CHECK3-NEXT: [[TMP11:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK3-NEXT: br label [[COND_END:%.*]] +// CHECK3: cond.false: +// CHECK3-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK3-NEXT: [[CONV9:%.*]] = sext i32 [[TMP12]] to i64 +// CHECK3-NEXT: br label [[COND_END]] +// CHECK3: cond.end: +// CHECK3-NEXT: [[COND:%.*]] = phi i64 [ [[TMP11]], [[COND_TRUE]] ], [ [[CONV9]], [[COND_FALSE]] ] +// CHECK3-NEXT: [[CONV10:%.*]] = trunc i64 [[COND]] to i32 +// CHECK3-NEXT: store i32 [[CONV10]], i32* [[DOTOMP_UB]], align 4 +// CHECK3-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK3-NEXT: store i32 [[TMP13]], i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK3-NEXT: [[CMP11:%.*]] = icmp sle i32 [[TMP14]], [[TMP15]] +// CHECK3-NEXT: br i1 [[CMP11]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]] +// CHECK3: omp.dispatch.body: +// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK3: omp.inner.for.cond: +// CHECK3-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK3-NEXT: [[CMP12:%.*]] = icmp sle i32 [[TMP16]], [[TMP17]] +// CHECK3-NEXT: br i1 [[CMP12]], label [[OMP_INNER_FOR_BODY:%.*]], label 
[[OMP_INNER_FOR_END:%.*]] +// CHECK3: omp.inner.for.body: +// CHECK3-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP18]], 1 +// CHECK3-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK3-NEXT: store i32 [[ADD]], i32* [[I6]], align 4 +// CHECK3-NEXT: [[TMP19:%.*]] = load i32, i32* [[I6]], align 4 +// CHECK3-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP19]] to i64 +// CHECK3-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1000 x i32], [1000 x i32]* [[TMP0]], i64 0, i64 [[IDXPROM]] +// CHECK3-NEXT: store i32 1, i32* [[ARRAYIDX]], align 4 +// CHECK3-NEXT: [[TMP20:%.*]] = load i32, i32* [[I6]], align 4 +// CHECK3-NEXT: store i32 [[TMP20]], i32* [[CONV1]], align 8 +// CHECK3-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK3: omp.body.continue: +// CHECK3-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK3: omp.inner.for.inc: +// CHECK3-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: [[ADD13:%.*]] = add nsw i32 [[TMP21]], 1 +// CHECK3-NEXT: store i32 [[ADD13]], i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK3: omp.inner.for.end: +// CHECK3-NEXT: br label [[OMP_DISPATCH_INC:%.*]] +// CHECK3: omp.dispatch.inc: +// CHECK3-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK3-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK3-NEXT: [[ADD14:%.*]] = add nsw i32 [[TMP22]], [[TMP23]] +// CHECK3-NEXT: store i32 [[ADD14]], i32* [[DOTOMP_LB]], align 4 +// CHECK3-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK3-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK3-NEXT: [[ADD15:%.*]] = add nsw i32 [[TMP24]], [[TMP25]] +// CHECK3-NEXT: store i32 [[ADD15]], i32* [[DOTOMP_UB]], align 4 +// CHECK3-NEXT: br label [[OMP_DISPATCH_COND]] +// CHECK3: omp.dispatch.end: +// CHECK3-NEXT: [[TMP26:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK3-NEXT: [[TMP27:%.*]] = load i32, i32* [[TMP26]], align 4 +// CHECK3-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP27]]) +// CHECK3-NEXT: [[TMP28:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK3-NEXT: [[TMP29:%.*]] = icmp ne i32 [[TMP28]], 0 +// CHECK3-NEXT: br i1 [[TMP29]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]] +// CHECK3: .omp.lastprivate.then: +// CHECK3-NEXT: [[TMP30:%.*]] = load i32, i32* [[CONV1]], align 8 +// CHECK3-NEXT: store i32 [[TMP30]], i32* [[CONV1]], align 8 +// CHECK3-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]] +// CHECK3: .omp.lastprivate.done: +// CHECK3-NEXT: br label [[OMP_PRECOND_END]] +// CHECK3: omp.precond.end: +// CHECK3-NEXT: ret void +// +// +// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l49 +// CHECK3-SAME: (i64 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { +// CHECK3-NEXT: entry: +// CHECK3-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 +// CHECK3-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 8 +// CHECK3-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8 +// CHECK3-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK3-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 +// CHECK3-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 8 +// CHECK3-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* +// CHECK3-NEXT: 
[[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 8 +// CHECK3-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK3-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) +// CHECK3-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK3: .execute: +// CHECK3-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) +// CHECK3-NEXT: [[TMP2:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK3-NEXT: [[CONV1:%.*]] = bitcast i64* [[N_CASTED]] to i32* +// CHECK3-NEXT: store i32 [[TMP2]], i32* [[CONV1]], align 4 +// CHECK3-NEXT: [[TMP3:%.*]] = load i64, i64* [[N_CASTED]], align 8 +// CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK3-NEXT: call void @__omp_outlined__2(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i64 [[TMP3]], [1000 x i16]* [[TMP0]]) #[[ATTR2]] +// CHECK3-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK3: .omp.deinit: +// CHECK3-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) +// CHECK3-NEXT: br label [[DOTEXIT:%.*]] +// CHECK3: .exit: +// CHECK3-NEXT: ret void +// +// +// CHECK3-LABEL: define {{[^@]+}}@__omp_outlined__2 +// CHECK3-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { +// CHECK3-NEXT: entry: +// CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK3-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 +// CHECK3-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 8 +// CHECK3-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[I3:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8 +// CHECK3-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x i8*], align 8 +// CHECK3-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK3-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +// CHECK3-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 +// CHECK3-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 8 +// CHECK3-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* +// CHECK3-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 8 +// CHECK3-NEXT: [[TMP1:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK3-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK3-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 +// CHECK3-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK3-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK3-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK3-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK3-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK3-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] +// CHECK3-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK3: 
omp.precond.then: +// CHECK3-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK3-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK3-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK3-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK3-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK3-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK3-NEXT: [[TMP5:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK3-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4 +// CHECK3-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP6]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) +// CHECK3-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK3-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK3-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP7]], [[TMP8]] +// CHECK3-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK3: cond.true: +// CHECK3-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK3-NEXT: br label [[COND_END:%.*]] +// CHECK3: cond.false: +// CHECK3-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK3-NEXT: br label [[COND_END]] +// CHECK3: cond.end: +// CHECK3-NEXT: [[COND:%.*]] = phi i32 [ [[TMP9]], [[COND_TRUE]] ], [ [[TMP10]], [[COND_FALSE]] ] +// CHECK3-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK3-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK3-NEXT: store i32 [[TMP11]], i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK3: omp.inner.for.cond: +// CHECK3-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK3-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP13]], 1 +// CHECK3-NEXT: [[CMP5:%.*]] = icmp slt i32 [[TMP12]], [[ADD]] +// CHECK3-NEXT: br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK3: omp.inner.for.body: +// CHECK3-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK3-NEXT: [[TMP15:%.*]] = zext i32 [[TMP14]] to i64 +// CHECK3-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK3-NEXT: [[TMP17:%.*]] = zext i32 [[TMP16]] to i64 +// CHECK3-NEXT: [[TMP18:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK3-NEXT: [[CONV6:%.*]] = bitcast i64* [[N_CASTED]] to i32* +// CHECK3-NEXT: store i32 [[TMP18]], i32* [[CONV6]], align 4 +// CHECK3-NEXT: [[TMP19:%.*]] = load i64, i64* [[N_CASTED]], align 8 +// CHECK3-NEXT: [[TMP20:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0 +// CHECK3-NEXT: [[TMP21:%.*]] = inttoptr i64 [[TMP15]] to i8* +// CHECK3-NEXT: store i8* [[TMP21]], i8** [[TMP20]], align 8 +// CHECK3-NEXT: [[TMP22:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1 +// CHECK3-NEXT: [[TMP23:%.*]] = inttoptr i64 [[TMP17]] to i8* +// CHECK3-NEXT: store i8* [[TMP23]], i8** [[TMP22]], align 8 +// CHECK3-NEXT: [[TMP24:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 2 +// CHECK3-NEXT: [[TMP25:%.*]] = inttoptr i64 [[TMP19]] to i8* +// CHECK3-NEXT: store i8* [[TMP25]], i8** [[TMP24]], align 8 +// CHECK3-NEXT: [[TMP26:%.*]] = getelementptr inbounds [4 x i8*], 
[4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 3 +// CHECK3-NEXT: [[TMP27:%.*]] = bitcast [1000 x i16]* [[TMP0]] to i8* +// CHECK3-NEXT: store i8* [[TMP27]], i8** [[TMP26]], align 8 +// CHECK3-NEXT: [[TMP28:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK3-NEXT: [[TMP29:%.*]] = load i32, i32* [[TMP28]], align 4 +// CHECK3-NEXT: [[TMP30:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK3-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP29]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i64, i64, i64, [1000 x i16]*)* @__omp_outlined__3 to i8*), i8* null, i8** [[TMP30]], i64 4) +// CHECK3-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK3: omp.inner.for.inc: +// CHECK3-NEXT: [[TMP31:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: [[TMP32:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK3-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP31]], [[TMP32]] +// CHECK3-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: [[TMP33:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK3-NEXT: [[TMP34:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK3-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP33]], [[TMP34]] +// CHECK3-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK3-NEXT: [[TMP35:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK3-NEXT: [[TMP36:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK3-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP35]], [[TMP36]] +// CHECK3-NEXT: store i32 [[ADD9]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK3-NEXT: [[TMP37:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK3-NEXT: [[TMP38:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK3-NEXT: [[CMP10:%.*]] = icmp sgt i32 [[TMP37]], [[TMP38]] +// CHECK3-NEXT: br i1 [[CMP10]], label [[COND_TRUE11:%.*]], label [[COND_FALSE12:%.*]] +// CHECK3: cond.true11: +// CHECK3-NEXT: [[TMP39:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK3-NEXT: br label [[COND_END13:%.*]] +// CHECK3: cond.false12: +// CHECK3-NEXT: [[TMP40:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK3-NEXT: br label [[COND_END13]] +// CHECK3: cond.end13: +// CHECK3-NEXT: [[COND14:%.*]] = phi i32 [ [[TMP39]], [[COND_TRUE11]] ], [ [[TMP40]], [[COND_FALSE12]] ] +// CHECK3-NEXT: store i32 [[COND14]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK3-NEXT: [[TMP41:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK3-NEXT: store i32 [[TMP41]], i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK3: omp.inner.for.end: +// CHECK3-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK3: omp.loop.exit: +// CHECK3-NEXT: [[TMP42:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK3-NEXT: [[TMP43:%.*]] = load i32, i32* [[TMP42]], align 4 +// CHECK3-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP43]]) +// CHECK3-NEXT: br label [[OMP_PRECOND_END]] +// CHECK3: omp.precond.end: +// CHECK3-NEXT: ret void +// +// +// CHECK3-LABEL: define {{[^@]+}}@__omp_outlined__3 +// CHECK3-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], i64 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { +// CHECK3-NEXT: entry: +// CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK3-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, 
align 8 +// CHECK3-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8 +// CHECK3-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 +// CHECK3-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 8 +// CHECK3-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[I5:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK3-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +// CHECK3-NEXT: store i64 [[DOTPREVIOUS_LB_]], i64* [[DOTPREVIOUS_LB__ADDR]], align 8 +// CHECK3-NEXT: store i64 [[DOTPREVIOUS_UB_]], i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK3-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 +// CHECK3-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 8 +// CHECK3-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* +// CHECK3-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 8 +// CHECK3-NEXT: [[TMP1:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK3-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK3-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 +// CHECK3-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK3-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK3-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK3-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK3-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK3-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] +// CHECK3-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK3: omp.precond.then: +// CHECK3-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 +// CHECK3-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK3-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_UB]], align 4 +// CHECK3-NEXT: [[TMP5:%.*]] = load i64, i64* [[DOTPREVIOUS_LB__ADDR]], align 8 +// CHECK3-NEXT: [[CONV3:%.*]] = trunc i64 [[TMP5]] to i32 +// CHECK3-NEXT: [[TMP6:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK3-NEXT: [[CONV4:%.*]] = trunc i64 [[TMP6]] to i32 +// CHECK3-NEXT: store i32 [[CONV3]], i32* [[DOTOMP_LB]], align 4 +// CHECK3-NEXT: store i32 [[CONV4]], i32* [[DOTOMP_UB]], align 4 +// CHECK3-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK3-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK3-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK3-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4 +// CHECK3-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP8]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) +// CHECK3-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK3-NEXT: store i32 [[TMP9]], i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK3: omp.inner.for.cond: +// CHECK3-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// 
CHECK3-NEXT: [[CONV6:%.*]] = sext i32 [[TMP10]] to i64 +// CHECK3-NEXT: [[TMP11:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK3-NEXT: [[CMP7:%.*]] = icmp ule i64 [[CONV6]], [[TMP11]] +// CHECK3-NEXT: br i1 [[CMP7]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK3: omp.inner.for.body: +// CHECK3-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP12]], 1 +// CHECK3-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK3-NEXT: store i32 [[ADD]], i32* [[I5]], align 4 +// CHECK3-NEXT: [[TMP13:%.*]] = load i32, i32* [[I5]], align 4 +// CHECK3-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP13]] to i64 +// CHECK3-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1000 x i16], [1000 x i16]* [[TMP0]], i64 0, i64 [[IDXPROM]] +// CHECK3-NEXT: [[TMP14:%.*]] = load i16, i16* [[ARRAYIDX]], align 2 +// CHECK3-NEXT: [[CONV8:%.*]] = sext i16 [[TMP14]] to i32 +// CHECK3-NEXT: [[ADD9:%.*]] = add nsw i32 [[CONV8]], 1 +// CHECK3-NEXT: [[CONV10:%.*]] = trunc i32 [[ADD9]] to i16 +// CHECK3-NEXT: store i16 [[CONV10]], i16* [[ARRAYIDX]], align 2 +// CHECK3-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK3: omp.body.continue: +// CHECK3-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK3: omp.inner.for.inc: +// CHECK3-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK3-NEXT: [[ADD11:%.*]] = add nsw i32 [[TMP15]], [[TMP16]] +// CHECK3-NEXT: store i32 [[ADD11]], i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK3: omp.inner.for.end: +// CHECK3-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK3: omp.loop.exit: +// CHECK3-NEXT: [[TMP17:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK3-NEXT: [[TMP18:%.*]] = load i32, i32* [[TMP17]], align 4 +// CHECK3-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP18]]) +// CHECK3-NEXT: br label [[OMP_PRECOND_END]] +// CHECK3: omp.precond.end: +// CHECK3-NEXT: ret void +// +// +// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l54 +// CHECK3-SAME: ([10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { +// CHECK3-NEXT: entry: +// CHECK3-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 8 +// CHECK3-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK3-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 8 +// CHECK3-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 8 +// CHECK3-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK3-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) +// CHECK3-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK3: .execute: +// CHECK3-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) +// CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK3-NEXT: call void @__omp_outlined__4(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x i32]* [[TMP0]]) #[[ATTR2]] +// CHECK3-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK3: .omp.deinit: +// CHECK3-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) +// CHECK3-NEXT: br label [[DOTEXIT:%.*]] +// CHECK3: .exit: +// CHECK3-NEXT: ret void +// +// +// CHECK3-LABEL: define {{[^@]+}}@__omp_outlined__4 +// CHECK3-SAME: (i32* 
noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { +// CHECK3-NEXT: entry: +// CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK3-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 8 +// CHECK3-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [3 x i8*], align 8 +// CHECK3-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK3-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +// CHECK3-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 8 +// CHECK3-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 8 +// CHECK3-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK3-NEXT: store i32 9, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK3-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK3-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK3-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK3-NEXT: [[TMP1:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK3-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4 +// CHECK3-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) +// CHECK3-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK3-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP3]], 9 +// CHECK3-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK3: cond.true: +// CHECK3-NEXT: br label [[COND_END:%.*]] +// CHECK3: cond.false: +// CHECK3-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK3-NEXT: br label [[COND_END]] +// CHECK3: cond.end: +// CHECK3-NEXT: [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP4]], [[COND_FALSE]] ] +// CHECK3-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK3-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK3-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK3: omp.inner.for.cond: +// CHECK3-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP6]], 10 +// CHECK3-NEXT: br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK3: omp.inner.for.body: +// CHECK3-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK3-NEXT: [[TMP8:%.*]] = zext i32 [[TMP7]] to i64 +// CHECK3-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK3-NEXT: [[TMP10:%.*]] = zext i32 [[TMP9]] to i64 +// CHECK3-NEXT: [[TMP11:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0 +// CHECK3-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP8]] to i8* +// CHECK3-NEXT: store i8* [[TMP12]], i8** [[TMP11]], align 8 +// CHECK3-NEXT: [[TMP13:%.*]] = getelementptr inbounds [3 x i8*], [3 x 
i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1 +// CHECK3-NEXT: [[TMP14:%.*]] = inttoptr i64 [[TMP10]] to i8* +// CHECK3-NEXT: store i8* [[TMP14]], i8** [[TMP13]], align 8 +// CHECK3-NEXT: [[TMP15:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 2 +// CHECK3-NEXT: [[TMP16:%.*]] = bitcast [10 x i32]* [[TMP0]] to i8* +// CHECK3-NEXT: store i8* [[TMP16]], i8** [[TMP15]], align 8 +// CHECK3-NEXT: [[TMP17:%.*]] = bitcast [3 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK3-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i64, i64, [10 x i32]*)* @__omp_outlined__5 to i8*), i8* null, i8** [[TMP17]], i64 3) +// CHECK3-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK3: omp.inner.for.inc: +// CHECK3-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK3-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP18]], [[TMP19]] +// CHECK3-NEXT: store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK3-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK3-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP20]], [[TMP21]] +// CHECK3-NEXT: store i32 [[ADD2]], i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK3-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK3-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK3-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP22]], [[TMP23]] +// CHECK3-NEXT: store i32 [[ADD3]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK3-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK3-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP24]], 9 +// CHECK3-NEXT: br i1 [[CMP4]], label [[COND_TRUE5:%.*]], label [[COND_FALSE6:%.*]] +// CHECK3: cond.true5: +// CHECK3-NEXT: br label [[COND_END7:%.*]] +// CHECK3: cond.false6: +// CHECK3-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK3-NEXT: br label [[COND_END7]] +// CHECK3: cond.end7: +// CHECK3-NEXT: [[COND8:%.*]] = phi i32 [ 9, [[COND_TRUE5]] ], [ [[TMP25]], [[COND_FALSE6]] ] +// CHECK3-NEXT: store i32 [[COND8]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK3-NEXT: [[TMP26:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK3-NEXT: store i32 [[TMP26]], i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK3: omp.inner.for.end: +// CHECK3-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK3: omp.loop.exit: +// CHECK3-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]]) +// CHECK3-NEXT: ret void +// +// +// CHECK3-LABEL: define {{[^@]+}}@__omp_outlined__5 +// CHECK3-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { +// CHECK3-NEXT: entry: +// CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK3-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8 +// CHECK3-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8 +// CHECK3-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 8 +// CHECK3-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 
+// CHECK3-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK3-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +// CHECK3-NEXT: store i64 [[DOTPREVIOUS_LB_]], i64* [[DOTPREVIOUS_LB__ADDR]], align 8 +// CHECK3-NEXT: store i64 [[DOTPREVIOUS_UB_]], i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK3-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 8 +// CHECK3-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 8 +// CHECK3-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 +// CHECK3-NEXT: store i32 9, i32* [[DOTOMP_UB]], align 4 +// CHECK3-NEXT: [[TMP1:%.*]] = load i64, i64* [[DOTPREVIOUS_LB__ADDR]], align 8 +// CHECK3-NEXT: [[CONV:%.*]] = trunc i64 [[TMP1]] to i32 +// CHECK3-NEXT: [[TMP2:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK3-NEXT: [[CONV1:%.*]] = trunc i64 [[TMP2]] to i32 +// CHECK3-NEXT: store i32 [[CONV]], i32* [[DOTOMP_LB]], align 4 +// CHECK3-NEXT: store i32 [[CONV1]], i32* [[DOTOMP_UB]], align 4 +// CHECK3-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK3-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK3-NEXT: [[TMP3:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK3-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4 +// CHECK3-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP4]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) +// CHECK3-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK3-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK3: omp.inner.for.cond: +// CHECK3-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: [[CONV2:%.*]] = sext i32 [[TMP6]] to i64 +// CHECK3-NEXT: [[TMP7:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK3-NEXT: [[CMP:%.*]] = icmp ule i64 [[CONV2]], [[TMP7]] +// CHECK3-NEXT: br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK3: omp.inner.for.body: +// CHECK3-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP8]], 1 +// CHECK3-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK3-NEXT: store i32 [[ADD]], i32* [[I]], align 4 +// CHECK3-NEXT: [[TMP9:%.*]] = load i32, i32* [[I]], align 4 +// CHECK3-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP9]] to i64 +// CHECK3-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP0]], i64 0, i64 [[IDXPROM]] +// CHECK3-NEXT: [[TMP10:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +// CHECK3-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP10]], 1 +// CHECK3-NEXT: store i32 [[ADD3]], i32* [[ARRAYIDX]], align 4 +// CHECK3-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK3: omp.body.continue: +// CHECK3-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK3: omp.inner.for.inc: +// CHECK3-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK3-NEXT: [[ADD4:%.*]] = add nsw i32 [[TMP11]], [[TMP12]] +// CHECK3-NEXT: store i32 [[ADD4]], i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK3: omp.inner.for.end: +// CHECK3-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK3: 
omp.loop.exit: +// CHECK3-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP4]]) +// CHECK3-NEXT: ret void +// +// +// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l59 +// CHECK3-SAME: ([10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i64 [[F:%.*]]) #[[ATTR0]] { +// CHECK3-NEXT: entry: +// CHECK3-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 8 +// CHECK3-NEXT: [[F_ADDR:%.*]] = alloca i64, align 8 +// CHECK3-NEXT: [[F_CASTED:%.*]] = alloca i64, align 8 +// CHECK3-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK3-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 8 +// CHECK3-NEXT: store i64 [[F]], i64* [[F_ADDR]], align 8 +// CHECK3-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 8 +// CHECK3-NEXT: [[CONV:%.*]] = bitcast i64* [[F_ADDR]] to i32* +// CHECK3-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK3-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) +// CHECK3-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK3: .execute: +// CHECK3-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) +// CHECK3-NEXT: [[TMP2:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK3-NEXT: [[CONV1:%.*]] = bitcast i64* [[F_CASTED]] to i32* +// CHECK3-NEXT: store i32 [[TMP2]], i32* [[CONV1]], align 4 +// CHECK3-NEXT: [[TMP3:%.*]] = load i64, i64* [[F_CASTED]], align 8 +// CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK3-NEXT: call void @__omp_outlined__6(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x [10 x i32]]* [[TMP0]], i64 [[TMP3]]) #[[ATTR2]] +// CHECK3-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK3: .omp.deinit: +// CHECK3-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) +// CHECK3-NEXT: br label [[DOTEXIT:%.*]] +// CHECK3: .exit: +// CHECK3-NEXT: ret void +// +// +// CHECK3-LABEL: define {{[^@]+}}@__omp_outlined__6 +// CHECK3-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i64 [[F:%.*]]) #[[ATTR0]] { +// CHECK3-NEXT: entry: +// CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK3-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 8 +// CHECK3-NEXT: [[F_ADDR:%.*]] = alloca i64, align 8 +// CHECK3-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[_TMP1:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[K:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[J:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[F_CASTED:%.*]] = alloca i64, align 8 +// CHECK3-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x i8*], align 8 +// CHECK3-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK3-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +// CHECK3-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** 
[[C_ADDR]], align 8
+// CHECK3-NEXT: store i64 [[F]], i64* [[F_ADDR]], align 8
+// CHECK3-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 8
+// CHECK3-NEXT: [[CONV:%.*]] = bitcast i64* [[F_ADDR]] to i32*
+// CHECK3-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK3-NEXT: store i32 99, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK3-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK3-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK3-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK3-NEXT: [[TMP1:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK3-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4
+// CHECK3-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]])
+// CHECK3-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK3-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP3]], 99
+// CHECK3-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK3: cond.true:
+// CHECK3-NEXT: br label [[COND_END:%.*]]
+// CHECK3: cond.false:
+// CHECK3-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK3-NEXT: br label [[COND_END]]
+// CHECK3: cond.end:
+// CHECK3-NEXT: [[COND:%.*]] = phi i32 [ 99, [[COND_TRUE]] ], [ [[TMP4]], [[COND_FALSE]] ]
+// CHECK3-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK3-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK3-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4
+// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK3: omp.inner.for.cond:
+// CHECK3-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK3-NEXT: [[CMP2:%.*]] = icmp slt i32 [[TMP6]], 100
+// CHECK3-NEXT: br i1 [[CMP2]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK3: omp.inner.for.body:
+// CHECK3-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK3-NEXT: [[TMP8:%.*]] = zext i32 [[TMP7]] to i64
+// CHECK3-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK3-NEXT: [[TMP10:%.*]] = zext i32 [[TMP9]] to i64
+// CHECK3-NEXT: [[TMP11:%.*]] = load i32, i32* [[CONV]], align 8
+// CHECK3-NEXT: [[CONV3:%.*]] = bitcast i64* [[F_CASTED]] to i32*
+// CHECK3-NEXT: store i32 [[TMP11]], i32* [[CONV3]], align 4
+// CHECK3-NEXT: [[TMP12:%.*]] = load i64, i64* [[F_CASTED]], align 8
+// CHECK3-NEXT: [[TMP13:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0
+// CHECK3-NEXT: [[TMP14:%.*]] = inttoptr i64 [[TMP8]] to i8*
+// CHECK3-NEXT: store i8* [[TMP14]], i8** [[TMP13]], align 8
+// CHECK3-NEXT: [[TMP15:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1
+// CHECK3-NEXT: [[TMP16:%.*]] = inttoptr i64 [[TMP10]] to i8*
+// CHECK3-NEXT: store i8* [[TMP16]], i8** [[TMP15]], align 8
+// CHECK3-NEXT: [[TMP17:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 2
+// CHECK3-NEXT: [[TMP18:%.*]] = bitcast [10 x [10 x i32]]* [[TMP0]] to i8*
+// CHECK3-NEXT: store i8* [[TMP18]], i8** [[TMP17]], align 8
+// CHECK3-NEXT: [[TMP19:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 3
+// CHECK3-NEXT: [[TMP20:%.*]] = inttoptr i64 [[TMP12]] to i8*
+// CHECK3-NEXT: store i8* [[TMP20]], i8** [[TMP19]], align 8
+// CHECK3-NEXT: [[TMP21:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK3-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i64, i64, [10 x [10 x i32]]*, i64)* @__omp_outlined__7 to i8*), i8* null, i8** [[TMP21]], i64 4)
+// CHECK3-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK3: omp.inner.for.inc:
+// CHECK3-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK3-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK3-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP22]], [[TMP23]]
+// CHECK3-NEXT: store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4
+// CHECK3-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK3-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK3-NEXT: [[ADD4:%.*]] = add nsw i32 [[TMP24]], [[TMP25]]
+// CHECK3-NEXT: store i32 [[ADD4]], i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK3-NEXT: [[TMP26:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK3-NEXT: [[TMP27:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK3-NEXT: [[ADD5:%.*]] = add nsw i32 [[TMP26]], [[TMP27]]
+// CHECK3-NEXT: store i32 [[ADD5]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK3-NEXT: [[TMP28:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK3-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[TMP28]], 99
+// CHECK3-NEXT: br i1 [[CMP6]], label [[COND_TRUE7:%.*]], label [[COND_FALSE8:%.*]]
+// CHECK3: cond.true7:
+// CHECK3-NEXT: br label [[COND_END9:%.*]]
+// CHECK3: cond.false8:
+// CHECK3-NEXT: [[TMP29:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK3-NEXT: br label [[COND_END9]]
+// CHECK3: cond.end9:
+// CHECK3-NEXT: [[COND10:%.*]] = phi i32 [ 99, [[COND_TRUE7]] ], [ [[TMP29]], [[COND_FALSE8]] ]
+// CHECK3-NEXT: store i32 [[COND10]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK3-NEXT: [[TMP30:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK3-NEXT: store i32 [[TMP30]], i32* [[DOTOMP_IV]], align 4
+// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK3: omp.inner.for.end:
+// CHECK3-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK3: omp.loop.exit:
+// CHECK3-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]])
+// CHECK3-NEXT: ret void
+//
+//
+// CHECK3-LABEL: define {{[^@]+}}@__omp_outlined__7
+// CHECK3-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i64 [[F:%.*]]) #[[ATTR0]] {
+// CHECK3-NEXT: entry:
+// CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK3-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8
+// CHECK3-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8
+// CHECK3-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 8
+// CHECK3-NEXT: [[F_ADDR:%.*]] = alloca i64, align 8
+// CHECK3-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[_TMP1:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[K:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[J:%.*]] = alloca i32,
align 4 +// CHECK3-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK3-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +// CHECK3-NEXT: store i64 [[DOTPREVIOUS_LB_]], i64* [[DOTPREVIOUS_LB__ADDR]], align 8 +// CHECK3-NEXT: store i64 [[DOTPREVIOUS_UB_]], i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK3-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 8 +// CHECK3-NEXT: store i64 [[F]], i64* [[F_ADDR]], align 8 +// CHECK3-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 8 +// CHECK3-NEXT: [[CONV:%.*]] = bitcast i64* [[F_ADDR]] to i32* +// CHECK3-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 +// CHECK3-NEXT: store i32 99, i32* [[DOTOMP_UB]], align 4 +// CHECK3-NEXT: [[TMP1:%.*]] = load i64, i64* [[DOTPREVIOUS_LB__ADDR]], align 8 +// CHECK3-NEXT: [[CONV2:%.*]] = trunc i64 [[TMP1]] to i32 +// CHECK3-NEXT: [[TMP2:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK3-NEXT: [[CONV3:%.*]] = trunc i64 [[TMP2]] to i32 +// CHECK3-NEXT: store i32 [[CONV2]], i32* [[DOTOMP_LB]], align 4 +// CHECK3-NEXT: store i32 [[CONV3]], i32* [[DOTOMP_UB]], align 4 +// CHECK3-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK3-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK3-NEXT: [[TMP3:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK3-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4 +// CHECK3-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP4]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) +// CHECK3-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK3-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK3: omp.inner.for.cond: +// CHECK3-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: [[CONV4:%.*]] = sext i32 [[TMP6]] to i64 +// CHECK3-NEXT: [[TMP7:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK3-NEXT: [[CMP:%.*]] = icmp ule i64 [[CONV4]], [[TMP7]] +// CHECK3-NEXT: br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK3: omp.inner.for.body: +// CHECK3-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: [[DIV:%.*]] = sdiv i32 [[TMP8]], 10 +// CHECK3-NEXT: [[MUL:%.*]] = mul nsw i32 [[DIV]], 1 +// CHECK3-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK3-NEXT: store i32 [[ADD]], i32* [[I]], align 4 +// CHECK3-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: [[DIV5:%.*]] = sdiv i32 [[TMP10]], 10 +// CHECK3-NEXT: [[MUL6:%.*]] = mul nsw i32 [[DIV5]], 10 +// CHECK3-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP9]], [[MUL6]] +// CHECK3-NEXT: [[MUL7:%.*]] = mul nsw i32 [[SUB]], 1 +// CHECK3-NEXT: [[ADD8:%.*]] = add nsw i32 0, [[MUL7]] +// CHECK3-NEXT: store i32 [[ADD8]], i32* [[J]], align 4 +// CHECK3-NEXT: store i32 10, i32* [[K]], align 4 +// CHECK3-NEXT: [[TMP11:%.*]] = load i32, i32* [[I]], align 4 +// CHECK3-NEXT: [[TMP12:%.*]] = load i32, i32* [[J]], align 4 +// CHECK3-NEXT: [[TMP13:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK3-NEXT: [[MUL9:%.*]] = mul nsw i32 [[TMP12]], [[TMP13]] +// CHECK3-NEXT: [[ADD10:%.*]] = add nsw i32 [[TMP11]], [[MUL9]] +// CHECK3-NEXT: [[TMP14:%.*]] = load i32, i32* [[K]], align 4 +// CHECK3-NEXT: [[ADD11:%.*]] = add nsw i32 
[[ADD10]], [[TMP14]] +// CHECK3-NEXT: [[TMP15:%.*]] = load i32, i32* [[I]], align 4 +// CHECK3-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP15]] to i64 +// CHECK3-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* [[TMP0]], i64 0, i64 [[IDXPROM]] +// CHECK3-NEXT: [[TMP16:%.*]] = load i32, i32* [[J]], align 4 +// CHECK3-NEXT: [[IDXPROM12:%.*]] = sext i32 [[TMP16]] to i64 +// CHECK3-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[ARRAYIDX]], i64 0, i64 [[IDXPROM12]] +// CHECK3-NEXT: store i32 [[ADD11]], i32* [[ARRAYIDX13]], align 4 +// CHECK3-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK3: omp.body.continue: +// CHECK3-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK3: omp.inner.for.inc: +// CHECK3-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK3-NEXT: [[ADD14:%.*]] = add nsw i32 [[TMP17]], [[TMP18]] +// CHECK3-NEXT: store i32 [[ADD14]], i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK3: omp.inner.for.end: +// CHECK3-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK3: omp.loop.exit: +// CHECK3-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP4]]) +// CHECK3-NEXT: ret void +// +// +// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l67 +// CHECK3-SAME: (i64 [[N:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]]) #[[ATTR0]] { +// CHECK3-NEXT: entry: +// CHECK3-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 +// CHECK3-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 8 +// CHECK3-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8 +// CHECK3-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK3-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 +// CHECK3-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 8 +// CHECK3-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* +// CHECK3-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 8 +// CHECK3-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK3-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) +// CHECK3-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK3: .execute: +// CHECK3-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) +// CHECK3-NEXT: [[TMP2:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK3-NEXT: [[CONV1:%.*]] = bitcast i64* [[N_CASTED]] to i32* +// CHECK3-NEXT: store i32 [[TMP2]], i32* [[CONV1]], align 4 +// CHECK3-NEXT: [[TMP3:%.*]] = load i64, i64* [[N_CASTED]], align 8 +// CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK3-NEXT: call void @__omp_outlined__8(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i64 [[TMP3]], [10 x [10 x i32]]* [[TMP0]]) #[[ATTR2]] +// CHECK3-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK3: .omp.deinit: +// CHECK3-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) +// CHECK3-NEXT: br label [[DOTEXIT:%.*]] +// CHECK3: .exit: +// CHECK3-NEXT: ret void +// +// +// CHECK3-LABEL: define {{[^@]+}}@__omp_outlined__8 +// CHECK3-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[N:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]]) #[[ATTR0]] { +// CHECK3-NEXT: entry: +// 
CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK3-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 +// CHECK3-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 8 +// CHECK3-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[_TMP1:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTCAPTURE_EXPR_3:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[J:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[I8:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[J9:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8 +// CHECK3-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x i8*], align 8 +// CHECK3-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK3-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +// CHECK3-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 +// CHECK3-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 8 +// CHECK3-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* +// CHECK3-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 8 +// CHECK3-NEXT: [[TMP1:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK3-NEXT: [[TMP2:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK3-NEXT: store i32 [[TMP2]], i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK3-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK3-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP3]], 0 +// CHECK3-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK3-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK3-NEXT: [[SUB4:%.*]] = sub nsw i32 [[TMP4]], 0 +// CHECK3-NEXT: [[DIV5:%.*]] = sdiv i32 [[SUB4]], 1 +// CHECK3-NEXT: [[MUL:%.*]] = mul nsw i32 [[DIV]], [[DIV5]] +// CHECK3-NEXT: [[SUB6:%.*]] = sub nsw i32 [[MUL]], 1 +// CHECK3-NEXT: store i32 [[SUB6]], i32* [[DOTCAPTURE_EXPR_3]], align 4 +// CHECK3-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK3-NEXT: store i32 0, i32* [[J]], align 4 +// CHECK3-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK3-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP5]] +// CHECK3-NEXT: br i1 [[CMP]], label [[LAND_LHS_TRUE:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK3: land.lhs.true: +// CHECK3-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK3-NEXT: [[CMP7:%.*]] = icmp slt i32 0, [[TMP6]] +// CHECK3-NEXT: br i1 [[CMP7]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END]] +// CHECK3: omp.precond.then: +// CHECK3-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK3-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4 +// CHECK3-NEXT: store i32 [[TMP7]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK3-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK3-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK3-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK3-NEXT: 
[[TMP8:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK3-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP8]], align 4
+// CHECK3-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP9]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]])
+// CHECK3-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK3-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4
+// CHECK3-NEXT: [[CMP10:%.*]] = icmp sgt i32 [[TMP10]], [[TMP11]]
+// CHECK3-NEXT: br i1 [[CMP10]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK3: cond.true:
+// CHECK3-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4
+// CHECK3-NEXT: br label [[COND_END:%.*]]
+// CHECK3: cond.false:
+// CHECK3-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK3-NEXT: br label [[COND_END]]
+// CHECK3: cond.end:
+// CHECK3-NEXT: [[COND:%.*]] = phi i32 [ [[TMP12]], [[COND_TRUE]] ], [ [[TMP13]], [[COND_FALSE]] ]
+// CHECK3-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK3-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK3-NEXT: store i32 [[TMP14]], i32* [[DOTOMP_IV]], align 4
+// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK3: omp.inner.for.cond:
+// CHECK3-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK3-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4
+// CHECK3-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP16]], 1
+// CHECK3-NEXT: [[CMP11:%.*]] = icmp slt i32 [[TMP15]], [[ADD]]
+// CHECK3-NEXT: br i1 [[CMP11]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK3: omp.inner.for.body:
+// CHECK3-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK3-NEXT: [[TMP18:%.*]] = zext i32 [[TMP17]] to i64
+// CHECK3-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK3-NEXT: [[TMP20:%.*]] = zext i32 [[TMP19]] to i64
+// CHECK3-NEXT: [[TMP21:%.*]] = load i32, i32* [[CONV]], align 8
+// CHECK3-NEXT: [[CONV12:%.*]] = bitcast i64* [[N_CASTED]] to i32*
+// CHECK3-NEXT: store i32 [[TMP21]], i32* [[CONV12]], align 4
+// CHECK3-NEXT: [[TMP22:%.*]] = load i64, i64* [[N_CASTED]], align 8
+// CHECK3-NEXT: [[TMP23:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0
+// CHECK3-NEXT: [[TMP24:%.*]] = inttoptr i64 [[TMP18]] to i8*
+// CHECK3-NEXT: store i8* [[TMP24]], i8** [[TMP23]], align 8
+// CHECK3-NEXT: [[TMP25:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1
+// CHECK3-NEXT: [[TMP26:%.*]] = inttoptr i64 [[TMP20]] to i8*
+// CHECK3-NEXT: store i8* [[TMP26]], i8** [[TMP25]], align 8
+// CHECK3-NEXT: [[TMP27:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 2
+// CHECK3-NEXT: [[TMP28:%.*]] = inttoptr i64 [[TMP22]] to i8*
+// CHECK3-NEXT: store i8* [[TMP28]], i8** [[TMP27]], align 8
+// CHECK3-NEXT: [[TMP29:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 3
+// CHECK3-NEXT: [[TMP30:%.*]] = bitcast [10 x [10 x i32]]* [[TMP0]] to i8*
+// CHECK3-NEXT: store i8* [[TMP30]], i8** [[TMP29]], align 8
+// CHECK3-NEXT: [[TMP31:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK3-NEXT: [[TMP32:%.*]] = load i32, i32* [[TMP31]], align 4
+// CHECK3-NEXT: [[TMP33:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK3-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP32]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i64, i64, i64, [10 x [10 x i32]]*)* @__omp_outlined__9 to i8*), i8* null, i8** [[TMP33]], i64 4)
+// CHECK3-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK3: omp.inner.for.inc:
+// CHECK3-NEXT: [[TMP34:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK3-NEXT: [[TMP35:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK3-NEXT: [[ADD13:%.*]] = add nsw i32 [[TMP34]], [[TMP35]]
+// CHECK3-NEXT: store i32 [[ADD13]], i32* [[DOTOMP_IV]], align 4
+// CHECK3-NEXT: [[TMP36:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK3-NEXT: [[TMP37:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK3-NEXT: [[ADD14:%.*]] = add nsw i32 [[TMP36]], [[TMP37]]
+// CHECK3-NEXT: store i32 [[ADD14]], i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK3-NEXT: [[TMP38:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK3-NEXT: [[TMP39:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK3-NEXT: [[ADD15:%.*]] = add nsw i32 [[TMP38]], [[TMP39]]
+// CHECK3-NEXT: store i32 [[ADD15]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK3-NEXT: [[TMP40:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK3-NEXT: [[TMP41:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4
+// CHECK3-NEXT: [[CMP16:%.*]] = icmp sgt i32 [[TMP40]], [[TMP41]]
+// CHECK3-NEXT: br i1 [[CMP16]], label [[COND_TRUE17:%.*]], label [[COND_FALSE18:%.*]]
+// CHECK3: cond.true17:
+// CHECK3-NEXT: [[TMP42:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4
+// CHECK3-NEXT: br label [[COND_END19:%.*]]
+// CHECK3: cond.false18:
+// CHECK3-NEXT: [[TMP43:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK3-NEXT: br label [[COND_END19]]
+// CHECK3: cond.end19:
+// CHECK3-NEXT: [[COND20:%.*]] = phi i32 [ [[TMP42]], [[COND_TRUE17]] ], [ [[TMP43]], [[COND_FALSE18]] ]
+// CHECK3-NEXT: store i32 [[COND20]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK3-NEXT: [[TMP44:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK3-NEXT: store i32 [[TMP44]], i32* [[DOTOMP_IV]], align 4
+// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK3: omp.inner.for.end:
+// CHECK3-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK3: omp.loop.exit:
+// CHECK3-NEXT: [[TMP45:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK3-NEXT: [[TMP46:%.*]] = load i32, i32* [[TMP45]], align 4
+// CHECK3-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP46]])
+// CHECK3-NEXT: br label [[OMP_PRECOND_END]]
+// CHECK3: omp.precond.end:
+// CHECK3-NEXT: ret void
+//
+//
+// CHECK3-LABEL: define {{[^@]+}}@__omp_outlined__9
+// CHECK3-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], i64 [[N:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]]) #[[ATTR0]] {
+// CHECK3-NEXT: entry:
+// CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK3-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8
+// CHECK3-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8
+// CHECK3-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8
+// CHECK3-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 8
+// CHECK3-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[_TMP1:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+//
CHECK3-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTCAPTURE_EXPR_3:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[J:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[I10:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[J11:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK3-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +// CHECK3-NEXT: store i64 [[DOTPREVIOUS_LB_]], i64* [[DOTPREVIOUS_LB__ADDR]], align 8 +// CHECK3-NEXT: store i64 [[DOTPREVIOUS_UB_]], i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK3-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 +// CHECK3-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 8 +// CHECK3-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* +// CHECK3-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 8 +// CHECK3-NEXT: [[TMP1:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK3-NEXT: [[TMP2:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK3-NEXT: store i32 [[TMP2]], i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK3-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK3-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP3]], 0 +// CHECK3-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK3-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK3-NEXT: [[SUB4:%.*]] = sub nsw i32 [[TMP4]], 0 +// CHECK3-NEXT: [[DIV5:%.*]] = sdiv i32 [[SUB4]], 1 +// CHECK3-NEXT: [[MUL:%.*]] = mul nsw i32 [[DIV]], [[DIV5]] +// CHECK3-NEXT: [[SUB6:%.*]] = sub nsw i32 [[MUL]], 1 +// CHECK3-NEXT: store i32 [[SUB6]], i32* [[DOTCAPTURE_EXPR_3]], align 4 +// CHECK3-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK3-NEXT: store i32 0, i32* [[J]], align 4 +// CHECK3-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK3-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP5]] +// CHECK3-NEXT: br i1 [[CMP]], label [[LAND_LHS_TRUE:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK3: land.lhs.true: +// CHECK3-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK3-NEXT: [[CMP7:%.*]] = icmp slt i32 0, [[TMP6]] +// CHECK3-NEXT: br i1 [[CMP7]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END]] +// CHECK3: omp.precond.then: +// CHECK3-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 +// CHECK3-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4 +// CHECK3-NEXT: store i32 [[TMP7]], i32* [[DOTOMP_UB]], align 4 +// CHECK3-NEXT: [[TMP8:%.*]] = load i64, i64* [[DOTPREVIOUS_LB__ADDR]], align 8 +// CHECK3-NEXT: [[CONV8:%.*]] = trunc i64 [[TMP8]] to i32 +// CHECK3-NEXT: [[TMP9:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK3-NEXT: [[CONV9:%.*]] = trunc i64 [[TMP9]] to i32 +// CHECK3-NEXT: store i32 [[CONV8]], i32* [[DOTOMP_LB]], align 4 +// CHECK3-NEXT: store i32 [[CONV9]], i32* [[DOTOMP_UB]], align 4 +// CHECK3-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK3-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK3-NEXT: [[TMP10:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK3-NEXT: [[TMP11:%.*]] = load i32, i32* [[TMP10]], align 4 +// 
CHECK3-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP11]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) +// CHECK3-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK3-NEXT: store i32 [[TMP12]], i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK3: omp.inner.for.cond: +// CHECK3-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: [[CONV12:%.*]] = sext i32 [[TMP13]] to i64 +// CHECK3-NEXT: [[TMP14:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK3-NEXT: [[CMP13:%.*]] = icmp ule i64 [[CONV12]], [[TMP14]] +// CHECK3-NEXT: br i1 [[CMP13]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK3: omp.inner.for.body: +// CHECK3-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK3-NEXT: [[SUB14:%.*]] = sub nsw i32 [[TMP16]], 0 +// CHECK3-NEXT: [[DIV15:%.*]] = sdiv i32 [[SUB14]], 1 +// CHECK3-NEXT: [[MUL16:%.*]] = mul nsw i32 1, [[DIV15]] +// CHECK3-NEXT: [[DIV17:%.*]] = sdiv i32 [[TMP15]], [[MUL16]] +// CHECK3-NEXT: [[MUL18:%.*]] = mul nsw i32 [[DIV17]], 1 +// CHECK3-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL18]] +// CHECK3-NEXT: store i32 [[ADD]], i32* [[I10]], align 4 +// CHECK3-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK3-NEXT: [[SUB19:%.*]] = sub nsw i32 [[TMP19]], 0 +// CHECK3-NEXT: [[DIV20:%.*]] = sdiv i32 [[SUB19]], 1 +// CHECK3-NEXT: [[MUL21:%.*]] = mul nsw i32 1, [[DIV20]] +// CHECK3-NEXT: [[DIV22:%.*]] = sdiv i32 [[TMP18]], [[MUL21]] +// CHECK3-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK3-NEXT: [[SUB23:%.*]] = sub nsw i32 [[TMP20]], 0 +// CHECK3-NEXT: [[DIV24:%.*]] = sdiv i32 [[SUB23]], 1 +// CHECK3-NEXT: [[MUL25:%.*]] = mul nsw i32 1, [[DIV24]] +// CHECK3-NEXT: [[MUL26:%.*]] = mul nsw i32 [[DIV22]], [[MUL25]] +// CHECK3-NEXT: [[SUB27:%.*]] = sub nsw i32 [[TMP17]], [[MUL26]] +// CHECK3-NEXT: [[MUL28:%.*]] = mul nsw i32 [[SUB27]], 1 +// CHECK3-NEXT: [[ADD29:%.*]] = add nsw i32 0, [[MUL28]] +// CHECK3-NEXT: store i32 [[ADD29]], i32* [[J11]], align 4 +// CHECK3-NEXT: [[TMP21:%.*]] = load i32, i32* [[I10]], align 4 +// CHECK3-NEXT: [[TMP22:%.*]] = load i32, i32* [[J11]], align 4 +// CHECK3-NEXT: [[ADD30:%.*]] = add nsw i32 [[TMP21]], [[TMP22]] +// CHECK3-NEXT: [[TMP23:%.*]] = load i32, i32* [[I10]], align 4 +// CHECK3-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP23]] to i64 +// CHECK3-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* [[TMP0]], i64 0, i64 [[IDXPROM]] +// CHECK3-NEXT: [[TMP24:%.*]] = load i32, i32* [[J11]], align 4 +// CHECK3-NEXT: [[IDXPROM31:%.*]] = sext i32 [[TMP24]] to i64 +// CHECK3-NEXT: [[ARRAYIDX32:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[ARRAYIDX]], i64 0, i64 [[IDXPROM31]] +// CHECK3-NEXT: store i32 [[ADD30]], i32* [[ARRAYIDX32]], align 4 +// CHECK3-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK3: omp.body.continue: +// CHECK3-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK3: omp.inner.for.inc: +// CHECK3-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: [[TMP26:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK3-NEXT: [[ADD33:%.*]] = add nsw i32 [[TMP25]], 
[[TMP26]] +// CHECK3-NEXT: store i32 [[ADD33]], i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK3: omp.inner.for.end: +// CHECK3-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK3: omp.loop.exit: +// CHECK3-NEXT: [[TMP27:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK3-NEXT: [[TMP28:%.*]] = load i32, i32* [[TMP27]], align 4 +// CHECK3-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP28]]) +// CHECK3-NEXT: br label [[OMP_PRECOND_END]] +// CHECK3: omp.precond.end: +// CHECK3-NEXT: ret void +// +// +// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l74 +// CHECK3-SAME: (i64 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32* [[V:%.*]]) #[[ATTR0]] { +// CHECK3-NEXT: entry: +// CHECK3-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 +// CHECK3-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 8 +// CHECK3-NEXT: [[V_ADDR:%.*]] = alloca i32*, align 8 +// CHECK3-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8 +// CHECK3-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK3-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 +// CHECK3-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 8 +// CHECK3-NEXT: store i32* [[V]], i32** [[V_ADDR]], align 8 +// CHECK3-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* +// CHECK3-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 8 +// CHECK3-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK3-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) +// CHECK3-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK3: .execute: +// CHECK3-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) +// CHECK3-NEXT: [[TMP2:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK3-NEXT: [[CONV1:%.*]] = bitcast i64* [[N_CASTED]] to i32* +// CHECK3-NEXT: store i32 [[TMP2]], i32* [[CONV1]], align 4 +// CHECK3-NEXT: [[TMP3:%.*]] = load i64, i64* [[N_CASTED]], align 8 +// CHECK3-NEXT: [[TMP4:%.*]] = load i32*, i32** [[V_ADDR]], align 8 +// CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK3-NEXT: call void @__omp_outlined__10(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i64 [[TMP3]], [1000 x i32]* [[TMP0]], i32* [[TMP4]]) #[[ATTR2]] +// CHECK3-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK3: .omp.deinit: +// CHECK3-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) +// CHECK3-NEXT: br label [[DOTEXIT:%.*]] +// CHECK3: .exit: +// CHECK3-NEXT: ret void +// +// +// CHECK3-LABEL: define {{[^@]+}}@__omp_outlined__10 +// CHECK3-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32* [[V:%.*]]) #[[ATTR0]] { +// CHECK3-NEXT: entry: +// CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK3-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 +// CHECK3-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 8 +// CHECK3-NEXT: [[V_ADDR:%.*]] = alloca i32*, align 8 +// CHECK3-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 
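//
// For orientation: the CHECK3/CHECK4 blocks in this hunk are auto-generated
// FileCheck expectations for the SPMD lowering of "target teams distribute
// parallel for" regions. Judging from the mangled kernel names
// ({{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l67 / _l74)
// and the captured values (n, c, a, v), the underlying test source is
// presumably of the following shape; this is an assumed reconstruction for
// readability only, not the verbatim test input:
//
// template <typename T>
// T ftemplate(int n) {
//   T c[10][10];    // assumed: matches [10 x [10 x i32]] with T = int
//   int a[1000];    // assumed: matches [1000 x i32]
//   int *v;
//
//   // presumed region behind ..._l67: collapse(2) loop storing i + j
//   #pragma omp target teams distribute parallel for collapse(2) map(tofrom : c)
//   for (int i = 0; i < n; i++)
//     for (int j = 0; j < n; j++)
//       c[i][j] = i + j;
//
//   // presumed region behind ..._l74: copy v into a
//   #pragma omp target teams distribute parallel for map(tofrom : a)
//   for (int i = 0; i < n; i++)
//     a[i] = v[i];
// }
//
// Whatever the exact source, the expectations here all verify one pattern:
// the kernel stub brackets execution with __kmpc_spmd_kernel_init and
// __kmpc_spmd_kernel_deinit_v2 around an outlined "teams distribute" body,
// which packs the chunk bounds and captured variables into
// [[CAPTURED_VARS_ADDRS]] and launches the inner "parallel for" outlined
// function through a single __kmpc_parallel_51 call.
//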
+// CHECK3-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[I3:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8 +// CHECK3-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [5 x i8*], align 8 +// CHECK3-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK3-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +// CHECK3-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 +// CHECK3-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 8 +// CHECK3-NEXT: store i32* [[V]], i32** [[V_ADDR]], align 8 +// CHECK3-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* +// CHECK3-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 8 +// CHECK3-NEXT: [[TMP1:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK3-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK3-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 +// CHECK3-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK3-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK3-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK3-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK3-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK3-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] +// CHECK3-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK3: omp.precond.then: +// CHECK3-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK3-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK3-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK3-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK3-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK3-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK3-NEXT: [[TMP5:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK3-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4 +// CHECK3-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP6]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) +// CHECK3-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK3-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK3-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP7]], [[TMP8]] +// CHECK3-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK3: cond.true: +// CHECK3-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK3-NEXT: br label [[COND_END:%.*]] +// CHECK3: cond.false: +// CHECK3-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK3-NEXT: br label [[COND_END]] +// CHECK3: cond.end: +// CHECK3-NEXT: [[COND:%.*]] = phi i32 [ [[TMP9]], [[COND_TRUE]] ], [ [[TMP10]], [[COND_FALSE]] ] +// CHECK3-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK3-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK3-NEXT: store i32 [[TMP11]], i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK3: 
omp.inner.for.cond: +// CHECK3-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK3-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP13]], 1 +// CHECK3-NEXT: [[CMP5:%.*]] = icmp slt i32 [[TMP12]], [[ADD]] +// CHECK3-NEXT: br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK3: omp.inner.for.body: +// CHECK3-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK3-NEXT: [[TMP15:%.*]] = zext i32 [[TMP14]] to i64 +// CHECK3-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK3-NEXT: [[TMP17:%.*]] = zext i32 [[TMP16]] to i64 +// CHECK3-NEXT: [[TMP18:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK3-NEXT: [[CONV6:%.*]] = bitcast i64* [[N_CASTED]] to i32* +// CHECK3-NEXT: store i32 [[TMP18]], i32* [[CONV6]], align 4 +// CHECK3-NEXT: [[TMP19:%.*]] = load i64, i64* [[N_CASTED]], align 8 +// CHECK3-NEXT: [[TMP20:%.*]] = load i32*, i32** [[V_ADDR]], align 8 +// CHECK3-NEXT: [[TMP21:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0 +// CHECK3-NEXT: [[TMP22:%.*]] = inttoptr i64 [[TMP15]] to i8* +// CHECK3-NEXT: store i8* [[TMP22]], i8** [[TMP21]], align 8 +// CHECK3-NEXT: [[TMP23:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1 +// CHECK3-NEXT: [[TMP24:%.*]] = inttoptr i64 [[TMP17]] to i8* +// CHECK3-NEXT: store i8* [[TMP24]], i8** [[TMP23]], align 8 +// CHECK3-NEXT: [[TMP25:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 2 +// CHECK3-NEXT: [[TMP26:%.*]] = inttoptr i64 [[TMP19]] to i8* +// CHECK3-NEXT: store i8* [[TMP26]], i8** [[TMP25]], align 8 +// CHECK3-NEXT: [[TMP27:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 3 +// CHECK3-NEXT: [[TMP28:%.*]] = bitcast [1000 x i32]* [[TMP0]] to i8* +// CHECK3-NEXT: store i8* [[TMP28]], i8** [[TMP27]], align 8 +// CHECK3-NEXT: [[TMP29:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 4 +// CHECK3-NEXT: [[TMP30:%.*]] = bitcast i32* [[TMP20]] to i8* +// CHECK3-NEXT: store i8* [[TMP30]], i8** [[TMP29]], align 8 +// CHECK3-NEXT: [[TMP31:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK3-NEXT: [[TMP32:%.*]] = load i32, i32* [[TMP31]], align 4 +// CHECK3-NEXT: [[TMP33:%.*]] = bitcast [5 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK3-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP32]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i64, i64, i64, [1000 x i32]*, i32*)* @__omp_outlined__11 to i8*), i8* null, i8** [[TMP33]], i64 5) +// CHECK3-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK3: omp.inner.for.inc: +// CHECK3-NEXT: [[TMP34:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: [[TMP35:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK3-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP34]], [[TMP35]] +// CHECK3-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: [[TMP36:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK3-NEXT: [[TMP37:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK3-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP36]], [[TMP37]] +// CHECK3-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK3-NEXT: [[TMP38:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK3-NEXT: [[TMP39:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK3-NEXT: [[ADD9:%.*]] = add nsw 
i32 [[TMP38]], [[TMP39]] +// CHECK3-NEXT: store i32 [[ADD9]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK3-NEXT: [[TMP40:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK3-NEXT: [[TMP41:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK3-NEXT: [[CMP10:%.*]] = icmp sgt i32 [[TMP40]], [[TMP41]] +// CHECK3-NEXT: br i1 [[CMP10]], label [[COND_TRUE11:%.*]], label [[COND_FALSE12:%.*]] +// CHECK3: cond.true11: +// CHECK3-NEXT: [[TMP42:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK3-NEXT: br label [[COND_END13:%.*]] +// CHECK3: cond.false12: +// CHECK3-NEXT: [[TMP43:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK3-NEXT: br label [[COND_END13]] +// CHECK3: cond.end13: +// CHECK3-NEXT: [[COND14:%.*]] = phi i32 [ [[TMP42]], [[COND_TRUE11]] ], [ [[TMP43]], [[COND_FALSE12]] ] +// CHECK3-NEXT: store i32 [[COND14]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK3-NEXT: [[TMP44:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK3-NEXT: store i32 [[TMP44]], i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK3: omp.inner.for.end: +// CHECK3-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK3: omp.loop.exit: +// CHECK3-NEXT: [[TMP45:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK3-NEXT: [[TMP46:%.*]] = load i32, i32* [[TMP45]], align 4 +// CHECK3-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP46]]) +// CHECK3-NEXT: br label [[OMP_PRECOND_END]] +// CHECK3: omp.precond.end: +// CHECK3-NEXT: ret void +// +// +// CHECK3-LABEL: define {{[^@]+}}@__omp_outlined__11 +// CHECK3-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], i64 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32* [[V:%.*]]) #[[ATTR0]] { +// CHECK3-NEXT: entry: +// CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK3-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8 +// CHECK3-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8 +// CHECK3-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 +// CHECK3-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 8 +// CHECK3-NEXT: [[V_ADDR:%.*]] = alloca i32*, align 8 +// CHECK3-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[I5:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK3-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +// CHECK3-NEXT: store i64 [[DOTPREVIOUS_LB_]], i64* [[DOTPREVIOUS_LB__ADDR]], align 8 +// CHECK3-NEXT: store i64 [[DOTPREVIOUS_UB_]], i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK3-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 +// CHECK3-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 8 +// CHECK3-NEXT: store i32* [[V]], i32** [[V_ADDR]], align 8 +// CHECK3-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* +// CHECK3-NEXT: [[TMP0:%.*]] = load [1000 x 
i32]*, [1000 x i32]** [[A_ADDR]], align 8 +// CHECK3-NEXT: [[TMP1:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK3-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK3-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 +// CHECK3-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK3-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK3-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK3-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK3-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK3-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] +// CHECK3-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK3: omp.precond.then: +// CHECK3-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 +// CHECK3-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK3-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_UB]], align 4 +// CHECK3-NEXT: [[TMP5:%.*]] = load i64, i64* [[DOTPREVIOUS_LB__ADDR]], align 8 +// CHECK3-NEXT: [[CONV3:%.*]] = trunc i64 [[TMP5]] to i32 +// CHECK3-NEXT: [[TMP6:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK3-NEXT: [[CONV4:%.*]] = trunc i64 [[TMP6]] to i32 +// CHECK3-NEXT: store i32 [[CONV3]], i32* [[DOTOMP_LB]], align 4 +// CHECK3-NEXT: store i32 [[CONV4]], i32* [[DOTOMP_UB]], align 4 +// CHECK3-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK3-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK3-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK3-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4 +// CHECK3-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP8]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) +// CHECK3-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK3-NEXT: store i32 [[TMP9]], i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK3: omp.inner.for.cond: +// CHECK3-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: [[CONV6:%.*]] = sext i32 [[TMP10]] to i64 +// CHECK3-NEXT: [[TMP11:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK3-NEXT: [[CMP7:%.*]] = icmp ule i64 [[CONV6]], [[TMP11]] +// CHECK3-NEXT: br i1 [[CMP7]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK3: omp.inner.for.body: +// CHECK3-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP12]], 1 +// CHECK3-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK3-NEXT: store i32 [[ADD]], i32* [[I5]], align 4 +// CHECK3-NEXT: [[TMP13:%.*]] = load i32*, i32** [[V_ADDR]], align 8 +// CHECK3-NEXT: [[TMP14:%.*]] = load i32, i32* [[I5]], align 4 +// CHECK3-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP14]] to i64 +// CHECK3-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TMP13]], i64 [[IDXPROM]] +// CHECK3-NEXT: [[TMP15:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +// CHECK3-NEXT: [[TMP16:%.*]] = load i32, i32* [[I5]], align 4 +// CHECK3-NEXT: [[IDXPROM8:%.*]] = sext i32 [[TMP16]] to i64 +// CHECK3-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds [1000 x i32], [1000 x i32]* [[TMP0]], i64 0, i64 [[IDXPROM8]] +// CHECK3-NEXT: store i32 [[TMP15]], i32* [[ARRAYIDX9]], align 4 +// CHECK3-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK3: omp.body.continue: +// CHECK3-NEXT: br label 
[[OMP_INNER_FOR_INC:%.*]] +// CHECK3: omp.inner.for.inc: +// CHECK3-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK3-NEXT: [[ADD10:%.*]] = add nsw i32 [[TMP17]], [[TMP18]] +// CHECK3-NEXT: store i32 [[ADD10]], i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK3: omp.inner.for.end: +// CHECK3-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK3: omp.loop.exit: +// CHECK3-NEXT: [[TMP19:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK3-NEXT: [[TMP20:%.*]] = load i32, i32* [[TMP19]], align 4 +// CHECK3-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP20]]) +// CHECK3-NEXT: br label [[OMP_PRECOND_END]] +// CHECK3: omp.precond.end: +// CHECK3-NEXT: ret void +// +// +// CHECK4-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l43 +// CHECK4-SAME: (i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32 [[L:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK4-NEXT: entry: +// CHECK4-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4 +// CHECK4-NEXT: [[L_ADDR:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[L_CASTED:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK4-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK4-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4 +// CHECK4-NEXT: store i32 [[L]], i32* [[L_ADDR]], align 4 +// CHECK4-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4 +// CHECK4-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK4-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) +// CHECK4-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK4: .execute: +// CHECK4-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3:[0-9]+]]) +// CHECK4-NEXT: [[TMP2:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK4-NEXT: store i32 [[TMP2]], i32* [[N_CASTED]], align 4 +// CHECK4-NEXT: [[TMP3:%.*]] = load i32, i32* [[N_CASTED]], align 4 +// CHECK4-NEXT: [[TMP4:%.*]] = load i32, i32* [[L_ADDR]], align 4 +// CHECK4-NEXT: store i32 [[TMP4]], i32* [[L_CASTED]], align 4 +// CHECK4-NEXT: [[TMP5:%.*]] = load i32, i32* [[L_CASTED]], align 4 +// CHECK4-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK4-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32 [[TMP3]], [1000 x i32]* [[TMP0]], i32 [[TMP5]]) #[[ATTR2:[0-9]+]] +// CHECK4-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK4: .omp.deinit: +// CHECK4-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) +// CHECK4-NEXT: br label [[DOTEXIT:%.*]] +// CHECK4: .exit: +// CHECK4-NEXT: ret void +// +// +// CHECK4-LABEL: define {{[^@]+}}@__omp_outlined__ +// CHECK4-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32 [[L:%.*]]) #[[ATTR0]] { +// CHECK4-NEXT: entry: +// CHECK4-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK4-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK4-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: 
[[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4 +// CHECK4-NEXT: [[L_ADDR:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[I4:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[L_CASTED:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [5 x i8*], align 4 +// CHECK4-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK4-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK4-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK4-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4 +// CHECK4-NEXT: store i32 [[L]], i32* [[L_ADDR]], align 4 +// CHECK4-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4 +// CHECK4-NEXT: [[TMP1:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared", align 2 +// CHECK4-NEXT: [[TMP2:%.*]] = load i32, i32* @"_openmp_static_kernel$size", align 4 +// CHECK4-NEXT: call void @__kmpc_get_team_static_memory(i16 1, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds (%"union._shared_openmp_static_memory_type_$_", %"union._shared_openmp_static_memory_type_$_" addrspace(3)* @"_openmp_shared_static_glob_rd_$_", i32 0, i32 0, i32 0) to i8*), i32 [[TMP2]], i16 [[TMP1]], i8** addrspacecast (i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr" to i8**)) +// CHECK4-NEXT: [[TMP3:%.*]] = load i8*, i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr", align 4 +// CHECK4-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, i8* [[TMP3]], i32 0 +// CHECK4-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to %struct._globalized_locals_ty* +// CHECK4-NEXT: [[L1:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP5]], i32 0, i32 0 +// CHECK4-NEXT: [[TMP6:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK4-NEXT: store i32 [[TMP6]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK4-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK4-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP7]], 0 +// CHECK4-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK4-NEXT: [[SUB3:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK4-NEXT: store i32 [[SUB3]], i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK4-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK4-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK4-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP8]] +// CHECK4-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK4: omp.precond.then: +// CHECK4-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK4-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK4-NEXT: store i32 [[TMP9]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK4-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK4-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK4-NEXT: [[TMP10:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK4-NEXT: [[TMP11:%.*]] = load i32, i32* 
[[TMP10]], align 4
+// CHECK4-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1:[0-9]+]], i32 [[TMP11]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 128)
+// CHECK4-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK4-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK4-NEXT: [[CMP5:%.*]] = icmp sgt i32 [[TMP12]], [[TMP13]]
+// CHECK4-NEXT: br i1 [[CMP5]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK4: cond.true:
+// CHECK4-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK4-NEXT: br label [[COND_END:%.*]]
+// CHECK4: cond.false:
+// CHECK4-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK4-NEXT: br label [[COND_END]]
+// CHECK4: cond.end:
+// CHECK4-NEXT: [[COND:%.*]] = phi i32 [ [[TMP14]], [[COND_TRUE]] ], [ [[TMP15]], [[COND_FALSE]] ]
+// CHECK4-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK4-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK4-NEXT: store i32 [[TMP16]], i32* [[DOTOMP_IV]], align 4
+// CHECK4-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK4: omp.inner.for.cond:
+// CHECK4-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK4-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK4-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP18]], 1
+// CHECK4-NEXT: [[CMP6:%.*]] = icmp slt i32 [[TMP17]], [[ADD]]
+// CHECK4-NEXT: br i1 [[CMP6]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK4: omp.inner.for.body:
+// CHECK4-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK4-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK4-NEXT: [[TMP21:%.*]] = load i32, i32* [[N_ADDR]], align 4
+// CHECK4-NEXT: store i32 [[TMP21]], i32* [[N_CASTED]], align 4
+// CHECK4-NEXT: [[TMP22:%.*]] = load i32, i32* [[N_CASTED]], align 4
+// CHECK4-NEXT: [[TMP23:%.*]] = load i32, i32* [[L_ADDR]], align 4
+// CHECK4-NEXT: store i32 [[TMP23]], i32* [[L_CASTED]], align 4
+// CHECK4-NEXT: [[TMP24:%.*]] = load i32, i32* [[L_CASTED]], align 4
+// CHECK4-NEXT: [[TMP25:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0
+// CHECK4-NEXT: [[TMP26:%.*]] = inttoptr i32 [[TMP19]] to i8*
+// CHECK4-NEXT: store i8* [[TMP26]], i8** [[TMP25]], align 4
+// CHECK4-NEXT: [[TMP27:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1
+// CHECK4-NEXT: [[TMP28:%.*]] = inttoptr i32 [[TMP20]] to i8*
+// CHECK4-NEXT: store i8* [[TMP28]], i8** [[TMP27]], align 4
+// CHECK4-NEXT: [[TMP29:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2
+// CHECK4-NEXT: [[TMP30:%.*]] = inttoptr i32 [[TMP22]] to i8*
+// CHECK4-NEXT: store i8* [[TMP30]], i8** [[TMP29]], align 4
+// CHECK4-NEXT: [[TMP31:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3
+// CHECK4-NEXT: [[TMP32:%.*]] = bitcast [1000 x i32]* [[TMP0]] to i8*
+// CHECK4-NEXT: store i8* [[TMP32]], i8** [[TMP31]], align 4
+// CHECK4-NEXT: [[TMP33:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 4
+// CHECK4-NEXT: [[TMP34:%.*]] = inttoptr i32 [[TMP24]] to i8*
+// CHECK4-NEXT: store i8* [[TMP34]], i8** [[TMP33]], align 4
+// CHECK4-NEXT: [[TMP35:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK4-NEXT: [[TMP36:%.*]] = load i32, i32* [[TMP35]], align 4
+// CHECK4-NEXT: [[TMP37:%.*]] = bitcast [5 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK4-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP36]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, i32, [1000 x i32]*, i32)* @__omp_outlined__1 to i8*), i8* null, i8** [[TMP37]], i32 5)
+// CHECK4-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK4: omp.inner.for.inc:
+// CHECK4-NEXT: [[TMP38:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK4-NEXT: [[TMP39:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK4-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP38]], [[TMP39]]
+// CHECK4-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_IV]], align 4
+// CHECK4-NEXT: [[TMP40:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK4-NEXT: [[TMP41:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK4-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP40]], [[TMP41]]
+// CHECK4-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK4-NEXT: [[TMP42:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK4-NEXT: [[TMP43:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK4-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP42]], [[TMP43]]
+// CHECK4-NEXT: store i32 [[ADD9]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK4-NEXT: [[TMP44:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK4-NEXT: [[TMP45:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK4-NEXT: [[CMP10:%.*]] = icmp sgt i32 [[TMP44]], [[TMP45]]
+// CHECK4-NEXT: br i1 [[CMP10]], label [[COND_TRUE11:%.*]], label [[COND_FALSE12:%.*]]
+// CHECK4: cond.true11:
+// CHECK4-NEXT: [[TMP46:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK4-NEXT: br label [[COND_END13:%.*]]
+// CHECK4: cond.false12:
+// CHECK4-NEXT: [[TMP47:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK4-NEXT: br label [[COND_END13]]
+// CHECK4: cond.end13:
+// CHECK4-NEXT: [[COND14:%.*]] = phi i32 [ [[TMP46]], [[COND_TRUE11]] ], [ [[TMP47]], [[COND_FALSE12]] ]
+// CHECK4-NEXT: store i32 [[COND14]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK4-NEXT: [[TMP48:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK4-NEXT: store i32 [[TMP48]], i32* [[DOTOMP_IV]], align 4
+// CHECK4-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK4: omp.inner.for.end:
+// CHECK4-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK4: omp.loop.exit:
+// CHECK4-NEXT: [[TMP49:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK4-NEXT: [[TMP50:%.*]] = load i32, i32* [[TMP49]], align 4
+// CHECK4-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP50]])
+// CHECK4-NEXT: [[TMP51:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK4-NEXT: [[TMP52:%.*]] = icmp ne i32 [[TMP51]], 0
+// CHECK4-NEXT: br i1 [[TMP52]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]]
+// CHECK4: .omp.lastprivate.then:
+// CHECK4-NEXT: [[TMP53:%.*]] = load i32, i32* [[L_ADDR]], align 4
+// CHECK4-NEXT: store i32 [[TMP53]], i32* [[L_ADDR]], align 4
+// CHECK4-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]]
+// CHECK4: .omp.lastprivate.done:
+// CHECK4-NEXT: br label [[OMP_PRECOND_END]]
+// CHECK4: omp.precond.end:
+// CHECK4-NEXT: [[TMP54:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared", align 2
+// CHECK4-NEXT: call void @__kmpc_restore_team_static_memory(i16 1, i16 [[TMP54]])
+// CHECK4-NEXT: ret void
+//
+//
+// CHECK4-LABEL: define {{[^@]+}}@__omp_outlined__1
+// CHECK4-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32 [[L:%.*]]) #[[ATTR0]] {
+// CHECK4-NEXT: entry:
+// CHECK4-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK4-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK4-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4
+// CHECK4-NEXT: [[L_ADDR:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[I3:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK4-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK4-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK4-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK4-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4
+// CHECK4-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4
+// CHECK4-NEXT: store i32 [[L]], i32* [[L_ADDR]], align 4
+// CHECK4-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4
+// CHECK4-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4
+// CHECK4-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK4-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK4-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0
+// CHECK4-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK4-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1
+// CHECK4-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK4-NEXT: store i32 0, i32* [[I]], align 4
+// CHECK4-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK4-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]]
+// CHECK4-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]]
+// CHECK4: omp.precond.then:
+// CHECK4-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK4-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK4-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_UB]], align 4
+// CHECK4-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK4-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK4-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_LB]], align 4
+// CHECK4-NEXT: store i32 [[TMP6]], i32* [[DOTOMP_UB]], align 4
+// CHECK4-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK4-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK4-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK4-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4
+// CHECK4-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP8]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 32)
+// CHECK4-NEXT: br label [[OMP_DISPATCH_COND:%.*]]
+// CHECK4: omp.dispatch.cond:
+// CHECK4-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK4-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK4-NEXT: [[CMP4:%.*]] = icmp ugt i32 [[TMP9]], [[TMP10]]
+// CHECK4-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK4: cond.true:
+// CHECK4-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK4-NEXT: br label [[COND_END:%.*]]
+// CHECK4: cond.false:
+// CHECK4-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK4-NEXT: br label [[COND_END]]
+// CHECK4: cond.end:
+// CHECK4-NEXT: [[COND:%.*]] = phi i32 [ [[TMP11]], [[COND_TRUE]] ], [ [[TMP12]], [[COND_FALSE]] ]
+// CHECK4-NEXT: store i32 [[COND]], i32* [[DOTOMP_UB]], align 4
+// CHECK4-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK4-NEXT: store i32 [[TMP13]], i32* [[DOTOMP_IV]], align 4
+// CHECK4-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK4-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK4-NEXT: [[CMP5:%.*]] = icmp sle i32 [[TMP14]], [[TMP15]]
+// CHECK4-NEXT: br i1 [[CMP5]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]]
+// CHECK4: omp.dispatch.body:
+// CHECK4-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK4: omp.inner.for.cond:
+// CHECK4-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK4-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK4-NEXT: [[CMP6:%.*]] = icmp sle i32 [[TMP16]], [[TMP17]]
+// CHECK4-NEXT: br i1 [[CMP6]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK4: omp.inner.for.body:
+// CHECK4-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK4-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP18]], 1
+// CHECK4-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK4-NEXT: store i32 [[ADD]], i32* [[I3]], align 4
+// CHECK4-NEXT: [[TMP19:%.*]] = load i32, i32* [[I3]], align 4
+// CHECK4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1000 x i32], [1000 x i32]* [[TMP0]], i32 0, i32 [[TMP19]]
+// CHECK4-NEXT: store i32 1, i32* [[ARRAYIDX]], align 4
+// CHECK4-NEXT: [[TMP20:%.*]] = load i32, i32* [[I3]], align 4
+// CHECK4-NEXT: store i32 [[TMP20]], i32* [[L_ADDR]], align 4
+// CHECK4-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK4: omp.body.continue:
+// CHECK4-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK4: omp.inner.for.inc:
+// CHECK4-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK4-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP21]], 1
+// CHECK4-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_IV]], align 4
+// CHECK4-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK4: omp.inner.for.end:
+// CHECK4-NEXT: br label [[OMP_DISPATCH_INC:%.*]]
+// CHECK4: omp.dispatch.inc:
+// CHECK4-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK4-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK4-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP22]], [[TMP23]]
+// CHECK4-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_LB]], align 4
+// CHECK4-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK4-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK4-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP24]], [[TMP25]]
+// CHECK4-NEXT: store i32 [[ADD9]], i32* [[DOTOMP_UB]], align 4
+// CHECK4-NEXT: br label [[OMP_DISPATCH_COND]]
+// CHECK4: omp.dispatch.end:
+// CHECK4-NEXT: [[TMP26:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK4-NEXT: [[TMP27:%.*]] = load i32, i32* [[TMP26]], align 4
+// CHECK4-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP27]])
+// CHECK4-NEXT: [[TMP28:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK4-NEXT: [[TMP29:%.*]] = icmp ne i32 [[TMP28]], 0
+// CHECK4-NEXT: br i1 [[TMP29]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]]
+// CHECK4: .omp.lastprivate.then:
+// CHECK4-NEXT: [[TMP30:%.*]] = load i32, i32* [[L_ADDR]], align 4
+// CHECK4-NEXT: store i32 [[TMP30]], i32* [[L_ADDR]], align 4
+// CHECK4-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]]
+// CHECK4: .omp.lastprivate.done:
+// CHECK4-NEXT: br label [[OMP_PRECOND_END]]
+// CHECK4: omp.precond.end:
+// CHECK4-NEXT: ret void
+//
+//
+// CHECK4-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l49
+// CHECK4-SAME: (i32 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] {
+// CHECK4-NEXT: entry:
+// CHECK4-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 4
+// CHECK4-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK4-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4
+// CHECK4-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 4
+// CHECK4-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 4
+// CHECK4-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK4-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0)
+// CHECK4-NEXT: br label [[DOTEXECUTE:%.*]]
+// CHECK4: .execute:
+// CHECK4-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]])
+// CHECK4-NEXT: [[TMP2:%.*]] = load i32, i32* [[N_ADDR]], align 4
+// CHECK4-NEXT: store i32 [[TMP2]], i32* [[N_CASTED]], align 4
+// CHECK4-NEXT: [[TMP3:%.*]] = load i32, i32* [[N_CASTED]], align 4
+// CHECK4-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK4-NEXT: call void @__omp_outlined__2(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32 [[TMP3]], [1000 x i16]* [[TMP0]]) #[[ATTR2]]
+// CHECK4-NEXT: br label [[DOTOMP_DEINIT:%.*]]
+// CHECK4: .omp.deinit:
+// CHECK4-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0)
+// CHECK4-NEXT: br label [[DOTEXIT:%.*]]
+// CHECK4: .exit:
+// CHECK4-NEXT: ret void
+//
+//
+// CHECK4-LABEL: define {{[^@]+}}@__omp_outlined__2
+// CHECK4-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] {
+// CHECK4-NEXT: entry:
+// CHECK4-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK4-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK4-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 4
+// CHECK4-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[I3:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x i8*], align 4
+// CHECK4-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK4-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK4-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4
+// CHECK4-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 4
+// CHECK4-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 4
+// CHECK4-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4
+// CHECK4-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK4-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK4-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0
+// CHECK4-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK4-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1
+// CHECK4-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK4-NEXT: store i32 0, i32* [[I]], align 4
+// CHECK4-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK4-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]]
+// CHECK4-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]]
+// CHECK4: omp.precond.then:
+// CHECK4-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK4-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK4-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK4-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK4-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK4-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK4-NEXT: [[TMP5:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK4-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4
+// CHECK4-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP6]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]])
+// CHECK4-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK4-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK4-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP7]], [[TMP8]]
+// CHECK4-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK4: cond.true:
+// CHECK4-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK4-NEXT: br label [[COND_END:%.*]]
+// CHECK4: cond.false:
+// CHECK4-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK4-NEXT: br label [[COND_END]]
+// CHECK4: cond.end:
+// CHECK4-NEXT: [[COND:%.*]] = phi i32 [ [[TMP9]], [[COND_TRUE]] ], [ [[TMP10]], [[COND_FALSE]] ]
+// CHECK4-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK4-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK4-NEXT: store i32 [[TMP11]], i32* [[DOTOMP_IV]], align 4
+// CHECK4-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK4: omp.inner.for.cond:
+// CHECK4-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK4-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK4-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP13]], 1
+// CHECK4-NEXT: [[CMP5:%.*]] = icmp slt i32 [[TMP12]], [[ADD]]
+// CHECK4-NEXT: br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK4: omp.inner.for.body:
+// CHECK4-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK4-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK4-NEXT: [[TMP16:%.*]] = load i32, i32* [[N_ADDR]], align 4
+// CHECK4-NEXT: store i32 [[TMP16]], i32* [[N_CASTED]], align 4
+// CHECK4-NEXT: [[TMP17:%.*]] = load i32, i32* [[N_CASTED]], align 4
+// CHECK4-NEXT: [[TMP18:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0
+// CHECK4-NEXT: [[TMP19:%.*]] = inttoptr i32 [[TMP14]] to i8*
+// CHECK4-NEXT: store i8* [[TMP19]], i8** [[TMP18]], align 4
+// CHECK4-NEXT: [[TMP20:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1
+// CHECK4-NEXT: [[TMP21:%.*]] = inttoptr i32 [[TMP15]] to i8*
+// CHECK4-NEXT: store i8* [[TMP21]], i8** [[TMP20]], align 4
+// CHECK4-NEXT: [[TMP22:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2
+// CHECK4-NEXT: [[TMP23:%.*]] = inttoptr i32 [[TMP17]] to i8*
+// CHECK4-NEXT: store i8* [[TMP23]], i8** [[TMP22]], align 4
+// CHECK4-NEXT: [[TMP24:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3
+// CHECK4-NEXT: [[TMP25:%.*]] = bitcast [1000 x i16]* [[TMP0]] to i8*
+// CHECK4-NEXT: store i8* [[TMP25]], i8** [[TMP24]], align 4
+// CHECK4-NEXT: [[TMP26:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK4-NEXT: [[TMP27:%.*]] = load i32, i32* [[TMP26]], align 4
+// CHECK4-NEXT: [[TMP28:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK4-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP27]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, i32, [1000 x i16]*)* @__omp_outlined__3 to i8*), i8* null, i8** [[TMP28]], i32 4)
+// CHECK4-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK4: omp.inner.for.inc:
+// CHECK4-NEXT: [[TMP29:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK4-NEXT: [[TMP30:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK4-NEXT: [[ADD6:%.*]] = add nsw i32 [[TMP29]], [[TMP30]]
+// CHECK4-NEXT: store i32 [[ADD6]], i32* [[DOTOMP_IV]], align 4
+// CHECK4-NEXT: [[TMP31:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK4-NEXT: [[TMP32:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK4-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP31]], [[TMP32]]
+// CHECK4-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK4-NEXT: [[TMP33:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK4-NEXT: [[TMP34:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK4-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP33]], [[TMP34]]
+// CHECK4-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK4-NEXT: [[TMP35:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK4-NEXT: [[TMP36:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK4-NEXT: [[CMP9:%.*]] = icmp sgt i32 [[TMP35]], [[TMP36]]
+// CHECK4-NEXT: br i1 [[CMP9]], label [[COND_TRUE10:%.*]], label [[COND_FALSE11:%.*]]
+// CHECK4: cond.true10:
+// CHECK4-NEXT: [[TMP37:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK4-NEXT: br label [[COND_END12:%.*]]
+// CHECK4: cond.false11:
+// CHECK4-NEXT: [[TMP38:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK4-NEXT: br label [[COND_END12]]
+// CHECK4: cond.end12:
+// CHECK4-NEXT: [[COND13:%.*]] = phi i32 [ [[TMP37]], [[COND_TRUE10]] ], [ [[TMP38]], [[COND_FALSE11]] ]
+// CHECK4-NEXT: store i32 [[COND13]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK4-NEXT: [[TMP39:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK4-NEXT: store i32 [[TMP39]], i32* [[DOTOMP_IV]], align 4
+// CHECK4-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK4: omp.inner.for.end:
+// CHECK4-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK4: omp.loop.exit:
+// CHECK4-NEXT: [[TMP40:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK4-NEXT: [[TMP41:%.*]] = load i32, i32* [[TMP40]], align 4
+// CHECK4-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP41]])
+// CHECK4-NEXT: br label [[OMP_PRECOND_END]]
+// CHECK4: omp.precond.end:
+// CHECK4-NEXT: ret void
+//
+//
+// CHECK4-LABEL: define {{[^@]+}}@__omp_outlined__3
+// CHECK4-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] {
+// CHECK4-NEXT: entry:
+// CHECK4-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK4-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK4-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 4
+// CHECK4-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[I3:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK4-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK4-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK4-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK4-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4
+// CHECK4-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 4
+// CHECK4-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 4
+// CHECK4-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4
+// CHECK4-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK4-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK4-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0
+// CHECK4-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK4-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1
+// CHECK4-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK4-NEXT: store i32 0, i32* [[I]], align 4
+// CHECK4-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK4-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]]
+// CHECK4-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]]
+// CHECK4: omp.precond.then:
+// CHECK4-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK4-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK4-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_UB]], align 4
+// CHECK4-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK4-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK4-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_LB]], align 4
+// CHECK4-NEXT: store i32 [[TMP6]], i32* [[DOTOMP_UB]], align 4
+// CHECK4-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK4-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK4-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK4-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4
+// CHECK4-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP8]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK4-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK4-NEXT: store i32 [[TMP9]], i32* [[DOTOMP_IV]], align 4
+// CHECK4-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK4: omp.inner.for.cond:
+// CHECK4-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK4-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK4-NEXT: [[CMP4:%.*]] = icmp ule i32 [[TMP10]], [[TMP11]]
+// CHECK4-NEXT: br i1 [[CMP4]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK4: omp.inner.for.body:
+// CHECK4-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK4-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP12]], 1
+// CHECK4-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK4-NEXT: store i32 [[ADD]], i32* [[I3]], align 4
+// CHECK4-NEXT: [[TMP13:%.*]] = load i32, i32* [[I3]], align 4
+// CHECK4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1000 x i16], [1000 x i16]* [[TMP0]], i32 0, i32 [[TMP13]]
+// CHECK4-NEXT: [[TMP14:%.*]] = load i16, i16* [[ARRAYIDX]], align 2
+// CHECK4-NEXT: [[CONV:%.*]] = sext i16 [[TMP14]] to i32
+// CHECK4-NEXT: [[ADD5:%.*]] = add nsw i32 [[CONV]], 1
+// CHECK4-NEXT: [[CONV6:%.*]] = trunc i32 [[ADD5]] to i16
+// CHECK4-NEXT: store i16 [[CONV6]], i16* [[ARRAYIDX]], align 2
+// CHECK4-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK4: omp.body.continue:
+// CHECK4-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK4: omp.inner.for.inc:
+// CHECK4-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK4-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK4-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP15]], [[TMP16]]
+// CHECK4-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_IV]], align 4
+// CHECK4-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK4: omp.inner.for.end:
+// CHECK4-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK4: omp.loop.exit:
+// CHECK4-NEXT: [[TMP17:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK4-NEXT: [[TMP18:%.*]] = load i32, i32* [[TMP17]], align 4
+// CHECK4-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP18]])
+// CHECK4-NEXT: br label [[OMP_PRECOND_END]]
+// CHECK4: omp.precond.end:
+// CHECK4-NEXT: ret void
+//
+//
+// CHECK4-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l54
+// CHECK4-SAME: ([10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] {
+// CHECK4-NEXT: entry:
+// CHECK4-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4
+// CHECK4-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK4-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4
+// CHECK4-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4
+// CHECK4-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK4-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0)
+// CHECK4-NEXT: br label [[DOTEXECUTE:%.*]]
+// CHECK4: .execute:
+// CHECK4-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]])
+// CHECK4-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK4-NEXT: call void @__omp_outlined__4(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x i32]* [[TMP0]]) #[[ATTR2]]
+// CHECK4-NEXT: br label [[DOTOMP_DEINIT:%.*]]
+// CHECK4: .omp.deinit:
+// CHECK4-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0)
+// CHECK4-NEXT: br label [[DOTEXIT:%.*]]
+// CHECK4: .exit:
+// CHECK4-NEXT: ret void
+//
+//
+// CHECK4-LABEL: define {{[^@]+}}@__omp_outlined__4
+// CHECK4-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] {
+// CHECK4-NEXT: entry:
+// CHECK4-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK4-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK4-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4
+// CHECK4-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [3 x i8*], align 4
+// CHECK4-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK4-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK4-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4
+// CHECK4-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4
+// CHECK4-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK4-NEXT: store i32 9, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK4-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK4-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK4-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK4-NEXT: [[TMP1:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK4-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4
+// CHECK4-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]])
+// CHECK4-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK4-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP3]], 9
+// CHECK4-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK4: cond.true:
+// CHECK4-NEXT: br label [[COND_END:%.*]]
+// CHECK4: cond.false:
+// CHECK4-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK4-NEXT: br label [[COND_END]]
+// CHECK4: cond.end:
+// CHECK4-NEXT: [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP4]], [[COND_FALSE]] ]
+// CHECK4-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK4-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK4-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4
+// CHECK4-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK4: omp.inner.for.cond:
+// CHECK4-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK4-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP6]], 10
+// CHECK4-NEXT: br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK4: omp.inner.for.body:
+// CHECK4-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK4-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK4-NEXT: [[TMP9:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0
+// CHECK4-NEXT: [[TMP10:%.*]] = inttoptr i32 [[TMP7]] to i8*
+// CHECK4-NEXT: store i8* [[TMP10]], i8** [[TMP9]], align 4
+// CHECK4-NEXT: [[TMP11:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1
+// CHECK4-NEXT: [[TMP12:%.*]] = inttoptr i32 [[TMP8]] to i8*
+// CHECK4-NEXT: store i8* [[TMP12]], i8** [[TMP11]], align 4
+// CHECK4-NEXT: [[TMP13:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2
+// CHECK4-NEXT: [[TMP14:%.*]] = bitcast [10 x i32]* [[TMP0]] to i8*
+// CHECK4-NEXT: store i8* [[TMP14]], i8** [[TMP13]], align 4
+// CHECK4-NEXT: [[TMP15:%.*]] = bitcast [3 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK4-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, [10 x i32]*)* @__omp_outlined__5 to i8*), i8* null, i8** [[TMP15]], i32 3)
+// CHECK4-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK4: omp.inner.for.inc:
+// CHECK4-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK4-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK4-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP16]], [[TMP17]]
+// CHECK4-NEXT: store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4
+// CHECK4-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK4-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK4-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP18]], [[TMP19]]
+// CHECK4-NEXT: store i32 [[ADD2]], i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK4-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK4-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK4-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP20]], [[TMP21]]
+// CHECK4-NEXT: store i32 [[ADD3]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK4-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK4-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP22]], 9
+// CHECK4-NEXT: br i1 [[CMP4]], label [[COND_TRUE5:%.*]], label [[COND_FALSE6:%.*]]
+// CHECK4: cond.true5:
+// CHECK4-NEXT: br label [[COND_END7:%.*]]
+// CHECK4: cond.false6:
+// CHECK4-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK4-NEXT: br label [[COND_END7]]
+// CHECK4: cond.end7:
+// CHECK4-NEXT: [[COND8:%.*]] = phi i32 [ 9, [[COND_TRUE5]] ], [ [[TMP23]], [[COND_FALSE6]] ]
+// CHECK4-NEXT: store i32 [[COND8]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK4-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK4-NEXT: store i32 [[TMP24]], i32* [[DOTOMP_IV]], align 4
+// CHECK4-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK4: omp.inner.for.end:
+// CHECK4-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK4: omp.loop.exit:
+// CHECK4-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]])
+// CHECK4-NEXT: ret void
+//
+//
+// CHECK4-LABEL: define {{[^@]+}}@__omp_outlined__5
+// CHECK4-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] {
+// CHECK4-NEXT: entry:
+// CHECK4-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK4-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK4-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4
+// CHECK4-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK4-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK4-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK4-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK4-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4
+// CHECK4-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4
+// CHECK4-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK4-NEXT: store i32 9, i32* [[DOTOMP_UB]], align 4
+// CHECK4-NEXT: [[TMP1:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK4-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK4-NEXT: store i32 [[TMP1]], i32* [[DOTOMP_LB]], align 4
+// CHECK4-NEXT: store i32 [[TMP2]], i32* [[DOTOMP_UB]], align 4
+// CHECK4-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK4-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK4-NEXT: [[TMP3:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK4-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4
+// CHECK4-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP4]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK4-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK4-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4
+// CHECK4-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK4: omp.inner.for.cond:
+// CHECK4-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK4-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK4-NEXT: [[CMP:%.*]] = icmp ule i32 [[TMP6]], [[TMP7]]
+// CHECK4-NEXT: br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK4: omp.inner.for.body:
+// CHECK4-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK4-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP8]], 1
+// CHECK4-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK4-NEXT: store i32 [[ADD]], i32* [[I]], align 4
+// CHECK4-NEXT: [[TMP9:%.*]] = load i32, i32* [[I]], align 4
+// CHECK4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP0]], i32 0, i32 [[TMP9]]
+// CHECK4-NEXT: [[TMP10:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+// CHECK4-NEXT: [[ADD1:%.*]] = add nsw i32 [[TMP10]], 1
+// CHECK4-NEXT: store i32 [[ADD1]], i32* [[ARRAYIDX]], align 4
+// CHECK4-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK4: omp.body.continue:
+// CHECK4-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK4: omp.inner.for.inc:
+// CHECK4-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK4-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK4-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP11]], [[TMP12]]
+// CHECK4-NEXT: store i32 [[ADD2]], i32* [[DOTOMP_IV]], align 4
+// CHECK4-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK4: omp.inner.for.end:
+// CHECK4-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK4: omp.loop.exit:
+// CHECK4-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP4]])
+// CHECK4-NEXT: ret void
+//
+//
+// CHECK4-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l59
+// CHECK4-SAME: ([10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i32 [[F:%.*]]) #[[ATTR0]] {
+// CHECK4-NEXT: entry:
+// CHECK4-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4
+// CHECK4-NEXT: [[F_ADDR:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[F_CASTED:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK4-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4
+// CHECK4-NEXT: store i32 [[F]], i32* [[F_ADDR]], align 4
+// CHECK4-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4
+// CHECK4-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK4-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0)
+// CHECK4-NEXT: br label [[DOTEXECUTE:%.*]]
+// CHECK4: .execute:
+// CHECK4-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]])
+// CHECK4-NEXT: [[TMP2:%.*]] = load i32, i32* [[F_ADDR]], align 4
+// CHECK4-NEXT: store i32 [[TMP2]], i32* [[F_CASTED]], align 4
+// CHECK4-NEXT: [[TMP3:%.*]] = load i32, i32* [[F_CASTED]], align 4
+// CHECK4-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK4-NEXT: call void @__omp_outlined__6(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x [10 x i32]]* [[TMP0]], i32 [[TMP3]]) #[[ATTR2]]
+// CHECK4-NEXT: br label [[DOTOMP_DEINIT:%.*]]
+// CHECK4: .omp.deinit:
+// CHECK4-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0)
+// CHECK4-NEXT: br label [[DOTEXIT:%.*]]
+// CHECK4: .exit:
+// CHECK4-NEXT: ret void
+//
+//
+// CHECK4-LABEL: define {{[^@]+}}@__omp_outlined__6
+// CHECK4-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i32 [[F:%.*]]) #[[ATTR0]] {
+// CHECK4-NEXT: entry:
+// CHECK4-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK4-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK4-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4
+// CHECK4-NEXT: [[F_ADDR:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[_TMP1:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[K:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[J:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[F_CASTED:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x i8*], align 4
+// CHECK4-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK4-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK4-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4
+// CHECK4-NEXT: store i32 [[F]], i32* [[F_ADDR]], align 4
+// CHECK4-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4
+// CHECK4-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK4-NEXT: store i32 99, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK4-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK4-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK4-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK4-NEXT: [[TMP1:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK4-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4
+// CHECK4-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]])
+// CHECK4-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK4-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP3]], 99
+// CHECK4-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK4: cond.true:
+// CHECK4-NEXT: br label [[COND_END:%.*]]
+// CHECK4: cond.false:
+// CHECK4-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK4-NEXT: br label [[COND_END]]
+// CHECK4: cond.end:
+// CHECK4-NEXT: [[COND:%.*]] = phi i32 [ 99, [[COND_TRUE]] ], [ [[TMP4]], [[COND_FALSE]] ]
+// CHECK4-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK4-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK4-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4
+// CHECK4-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK4: omp.inner.for.cond:
+// CHECK4-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK4-NEXT: [[CMP2:%.*]] = icmp slt i32 [[TMP6]], 100
+// CHECK4-NEXT: br i1 [[CMP2]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK4: omp.inner.for.body:
+// CHECK4-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK4-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK4-NEXT: [[TMP9:%.*]] = load i32, i32* [[F_ADDR]], align 4
+// CHECK4-NEXT: store i32 [[TMP9]], i32* [[F_CASTED]], align 4
+// CHECK4-NEXT: [[TMP10:%.*]] = load i32, i32* [[F_CASTED]], align 4
+// CHECK4-NEXT: [[TMP11:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0
+// CHECK4-NEXT: [[TMP12:%.*]] = inttoptr i32 [[TMP7]] to i8*
+// CHECK4-NEXT: store i8* [[TMP12]], i8** [[TMP11]], align 4
+// CHECK4-NEXT: [[TMP13:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1
+// CHECK4-NEXT: [[TMP14:%.*]] = inttoptr i32 [[TMP8]] to i8*
+// CHECK4-NEXT: store i8* [[TMP14]], i8** [[TMP13]], align 4
+// CHECK4-NEXT: [[TMP15:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2
+// CHECK4-NEXT: [[TMP16:%.*]] = bitcast [10 x [10 x i32]]* [[TMP0]] to i8*
+// CHECK4-NEXT: store i8* [[TMP16]], i8** [[TMP15]], align 4
+// CHECK4-NEXT: [[TMP17:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3
+// CHECK4-NEXT: [[TMP18:%.*]] = inttoptr i32 [[TMP10]] to i8*
+// CHECK4-NEXT: store i8* [[TMP18]], i8** [[TMP17]], align 4
+// CHECK4-NEXT: [[TMP19:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK4-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, [10 x [10 x i32]]*, i32)* @__omp_outlined__7 to i8*), i8* null, i8** [[TMP19]], i32 4)
+// CHECK4-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK4: omp.inner.for.inc:
+// CHECK4-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK4-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK4-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP20]], [[TMP21]]
+// CHECK4-NEXT: store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4
+// CHECK4-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK4-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK4-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP22]], [[TMP23]]
+// CHECK4-NEXT: store i32 [[ADD3]], i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK4-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK4-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK4-NEXT: [[ADD4:%.*]] = add nsw i32 [[TMP24]], [[TMP25]]
+// CHECK4-NEXT: store i32 [[ADD4]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK4-NEXT: [[TMP26:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK4-NEXT: [[CMP5:%.*]] = icmp sgt i32 [[TMP26]], 99
+// CHECK4-NEXT: br i1 [[CMP5]], label [[COND_TRUE6:%.*]], label [[COND_FALSE7:%.*]]
+// CHECK4: cond.true6:
+// CHECK4-NEXT: br label [[COND_END8:%.*]]
+// CHECK4: cond.false7:
+// CHECK4-NEXT: [[TMP27:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK4-NEXT: br label [[COND_END8]]
+// CHECK4: cond.end8:
+// CHECK4-NEXT: [[COND9:%.*]] = phi i32 [ 99, [[COND_TRUE6]] ], [ [[TMP27]], [[COND_FALSE7]] ]
+// CHECK4-NEXT: store i32 [[COND9]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK4-NEXT: [[TMP28:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK4-NEXT: store i32 [[TMP28]], i32* [[DOTOMP_IV]], align 4
+// CHECK4-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK4: omp.inner.for.end:
+// CHECK4-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK4: omp.loop.exit:
+// CHECK4-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]])
+// CHECK4-NEXT: ret void
+//
+//
+// CHECK4-LABEL: define {{[^@]+}}@__omp_outlined__7
+// CHECK4-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i32 [[F:%.*]]) #[[ATTR0]] {
+// CHECK4-NEXT: entry:
+// CHECK4-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK4-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK4-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4
+// CHECK4-NEXT: [[F_ADDR:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[_TMP1:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[K:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[J:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK4-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK4-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK4-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK4-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4
+// CHECK4-NEXT: store i32 [[F]], i32* [[F_ADDR]], align 4
+// CHECK4-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4
+// CHECK4-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK4-NEXT: store i32 99, i32* [[DOTOMP_UB]], align 4
+// CHECK4-NEXT: [[TMP1:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK4-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK4-NEXT: store i32 [[TMP1]], i32* [[DOTOMP_LB]], align 4
+// CHECK4-NEXT: store i32 [[TMP2]], i32* [[DOTOMP_UB]], align 4
+// CHECK4-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK4-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK4-NEXT: [[TMP3:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK4-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4
+// CHECK4-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP4]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK4-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK4-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4
+// CHECK4-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK4: omp.inner.for.cond:
+// CHECK4-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK4-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK4-NEXT: [[CMP:%.*]] = icmp ule i32 [[TMP6]], [[TMP7]]
+// CHECK4-NEXT: br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK4: omp.inner.for.body:
+// CHECK4-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK4-NEXT: [[DIV:%.*]] = sdiv i32 [[TMP8]], 10
+// CHECK4-NEXT: [[MUL:%.*]] = mul nsw i32 [[DIV]], 1
+// CHECK4-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK4-NEXT: store i32 [[ADD]], i32* [[I]], align 4
+// CHECK4-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK4-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK4-NEXT: [[DIV2:%.*]] = sdiv i32 [[TMP10]], 10
+// CHECK4-NEXT: [[MUL3:%.*]] = mul nsw i32 [[DIV2]], 10
+// CHECK4-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP9]], [[MUL3]]
+// CHECK4-NEXT: [[MUL4:%.*]] = mul nsw i32 [[SUB]], 1
+// CHECK4-NEXT: [[ADD5:%.*]] = add nsw i32 0, [[MUL4]]
+// CHECK4-NEXT: store i32 [[ADD5]], i32* [[J]], align 4
+// CHECK4-NEXT: store i32 10, i32* [[K]], align 4
+// CHECK4-NEXT: [[TMP11:%.*]] = load i32, i32* [[I]], align 4
+// CHECK4-NEXT: [[TMP12:%.*]] = load i32, i32* [[J]], align 4
+// CHECK4-NEXT: [[TMP13:%.*]] = load i32, i32* [[F_ADDR]], align 4
+// CHECK4-NEXT: [[MUL6:%.*]] = mul nsw i32 [[TMP12]], [[TMP13]]
+// CHECK4-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP11]], [[MUL6]]
+// CHECK4-NEXT: [[TMP14:%.*]] = load i32, i32* [[K]], align 4
+// CHECK4-NEXT: [[ADD8:%.*]] = add nsw i32 [[ADD7]], [[TMP14]]
+// CHECK4-NEXT: [[TMP15:%.*]] = load i32, i32* [[I]], align 4
+// CHECK4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* [[TMP0]], i32 0, i32 [[TMP15]]
+// CHECK4-NEXT: [[TMP16:%.*]] = load i32, i32* [[J]], align 4
+// CHECK4-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[ARRAYIDX]], i32 0, i32 [[TMP16]]
+// CHECK4-NEXT: store i32 [[ADD8]], i32* [[ARRAYIDX9]], align 4
+// CHECK4-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK4: omp.body.continue:
+// CHECK4-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK4: omp.inner.for.inc:
+// CHECK4-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK4-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK4-NEXT: [[ADD10:%.*]] = add nsw i32 [[TMP17]], [[TMP18]]
+// CHECK4-NEXT: store i32 [[ADD10]], i32* [[DOTOMP_IV]], align 4
+// CHECK4-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK4: omp.inner.for.end:
+// CHECK4-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK4: omp.loop.exit:
+// CHECK4-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP4]])
+// CHECK4-NEXT: ret void
+//
+//
+// CHECK4-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l67
+// CHECK4-SAME: (i32 [[N:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]]) #[[ATTR0]] {
+// CHECK4-NEXT: entry:
+// CHECK4-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4
+// CHECK4-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK4-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4
+// CHECK4-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4
+// CHECK4-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4
+// CHECK4-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK4-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0)
+// CHECK4-NEXT: br label [[DOTEXECUTE:%.*]]
+// CHECK4: .execute:
+// CHECK4-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]])
+// CHECK4-NEXT: [[TMP2:%.*]] = load i32, i32* [[N_ADDR]], align 4
+// CHECK4-NEXT: store i32 [[TMP2]], i32* [[N_CASTED]], align 4
+// CHECK4-NEXT: [[TMP3:%.*]] = load i32, i32* [[N_CASTED]], align 4
+// CHECK4-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK4-NEXT: call void @__omp_outlined__8(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32 [[TMP3]], [10 x [10 x i32]]* [[TMP0]]) #[[ATTR2]]
+// CHECK4-NEXT: br label [[DOTOMP_DEINIT:%.*]]
+// CHECK4: .omp.deinit:
+// CHECK4-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0)
+// CHECK4-NEXT: br label [[DOTEXIT:%.*]]
+// CHECK4: .exit:
+// CHECK4-NEXT: ret void
+//
+//
+// CHECK4-LABEL: define {{[^@]+}}@__omp_outlined__8
+// CHECK4-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[N:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]]) #[[ATTR0]] {
+// CHECK4-NEXT: entry:
+// CHECK4-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK4-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK4-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4
+// CHECK4-NEXT: [[DOTOMP_IV:%.*]] = alloca i64, align 8
+// CHECK4-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[_TMP1:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[DOTCAPTURE_EXPR_3:%.*]] = alloca i64, align 8
+// CHECK4-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[J:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i64, align 8
+// CHECK4-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i64, align 8
+// CHECK4-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i64, align 8
+// CHECK4-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[I9:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[J10:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x i8*], align 4
+// CHECK4-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK4-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK4-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4
+// CHECK4-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4
+// CHECK4-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4
+// CHECK4-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4
+// CHECK4-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK4-NEXT: [[TMP2:%.*]] = load i32, i32* [[N_ADDR]], align 4
+// CHECK4-NEXT: store i32 [[TMP2]], i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK4-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK4-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP3]], 0
+// CHECK4-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK4-NEXT: [[CONV:%.*]] = sext i32 [[DIV]] to i64
+// CHECK4-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK4-NEXT: [[SUB4:%.*]] = sub nsw i32 [[TMP4]], 0
+// CHECK4-NEXT: [[DIV5:%.*]] = sdiv i32 [[SUB4]], 1
+// CHECK4-NEXT: [[CONV6:%.*]] = sext i32 [[DIV5]] to i64
+// CHECK4-NEXT: [[MUL:%.*]] = mul nsw i64 [[CONV]], [[CONV6]]
+// CHECK4-NEXT: [[SUB7:%.*]] = sub nsw i64 [[MUL]], 1
+// CHECK4-NEXT: store i64 [[SUB7]], i64* [[DOTCAPTURE_EXPR_3]], align 8
+// CHECK4-NEXT: store i32 0, i32* [[I]], align 4
+// CHECK4-NEXT: store i32 0, i32* [[J]], align 4
+// CHECK4-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK4-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP5]]
+// CHECK4-NEXT: br i1 [[CMP]], label [[LAND_LHS_TRUE:%.*]], label [[OMP_PRECOND_END:%.*]]
+// CHECK4: land.lhs.true:
+// CHECK4-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK4-NEXT: [[CMP8:%.*]] = icmp slt i32 0, [[TMP6]]
+// CHECK4-NEXT: br i1 [[CMP8]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END]]
+// CHECK4: omp.precond.then:
+// CHECK4-NEXT: store i64 0, i64* [[DOTOMP_COMB_LB]], align 8
+// CHECK4-NEXT: [[TMP7:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8
+// CHECK4-NEXT: store i64 [[TMP7]], i64* [[DOTOMP_COMB_UB]], align 8
+// CHECK4-NEXT: store i64 1, i64* [[DOTOMP_STRIDE]], align 8
+// CHECK4-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK4-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK4-NEXT: [[CONV11:%.*]] = zext i32 [[NVPTX_NUM_THREADS]] to i64
+// CHECK4-NEXT: [[TMP8:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK4-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP8]], align 4
+// CHECK4-NEXT: call void @__kmpc_for_static_init_8(%struct.ident_t* @[[GLOB1]], i32 [[TMP9]], i32 91, i32* [[DOTOMP_IS_LAST]], i64* [[DOTOMP_COMB_LB]], i64* [[DOTOMP_COMB_UB]], i64* [[DOTOMP_STRIDE]], i64 1, i64 [[CONV11]])
+// CHECK4-NEXT: [[TMP10:%.*]] = load i64, i64* [[DOTOMP_COMB_UB]], align 8
+// CHECK4-NEXT: [[TMP11:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8
+// CHECK4-NEXT: [[CMP12:%.*]] = icmp sgt i64 [[TMP10]], [[TMP11]]
+// CHECK4-NEXT: br i1 [[CMP12]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK4: cond.true:
+// CHECK4-NEXT: [[TMP12:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8
+// CHECK4-NEXT: br label [[COND_END:%.*]]
+// CHECK4: cond.false:
+// CHECK4-NEXT: [[TMP13:%.*]] = load i64, i64* [[DOTOMP_COMB_UB]], align 8
+// CHECK4-NEXT: br label [[COND_END]]
+// CHECK4: cond.end:
+// CHECK4-NEXT: [[COND:%.*]] = phi i64 [ [[TMP12]], [[COND_TRUE]] ], [ [[TMP13]], [[COND_FALSE]] ]
+// CHECK4-NEXT: store i64 [[COND]], i64* [[DOTOMP_COMB_UB]], align 8
+// CHECK4-NEXT: [[TMP14:%.*]] = load i64, i64* [[DOTOMP_COMB_LB]], align 8
+// CHECK4-NEXT: store i64 [[TMP14]], i64* [[DOTOMP_IV]], align 8
+// CHECK4-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK4: omp.inner.for.cond:
+// CHECK4-NEXT: [[TMP15:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8
+// CHECK4-NEXT: [[TMP16:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8
+// CHECK4-NEXT: [[ADD:%.*]] = add nsw i64 [[TMP16]], 1
+// CHECK4-NEXT: [[CMP13:%.*]] = icmp slt i64 [[TMP15]], [[ADD]]
+// CHECK4-NEXT: br i1 [[CMP13]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK4: omp.inner.for.body:
+// CHECK4-NEXT: [[TMP17:%.*]] = load i64, i64* [[DOTOMP_COMB_LB]], align 8
+// CHECK4-NEXT: [[TMP18:%.*]] = trunc i64 [[TMP17]] to i32
+// CHECK4-NEXT: [[TMP19:%.*]] = load i64, i64* [[DOTOMP_COMB_UB]], align 8
+// CHECK4-NEXT: [[TMP20:%.*]] = trunc i64 [[TMP19]] to i32
+// CHECK4-NEXT: [[TMP21:%.*]] = load i32, i32* [[N_ADDR]], align 4
+// CHECK4-NEXT: store i32 [[TMP21]], i32* [[N_CASTED]], align 4
+// CHECK4-NEXT: [[TMP22:%.*]] = load i32, i32* [[N_CASTED]], align 4
+// CHECK4-NEXT: [[TMP23:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0
+// CHECK4-NEXT: [[TMP24:%.*]] = inttoptr i32 [[TMP18]] to i8*
+// CHECK4-NEXT: store i8* [[TMP24]], i8** [[TMP23]], align 4
+// CHECK4-NEXT: [[TMP25:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1
+// CHECK4-NEXT: [[TMP26:%.*]] = inttoptr i32 [[TMP20]] to i8*
+// CHECK4-NEXT: store i8* [[TMP26]], i8** [[TMP25]], align 4
+// CHECK4-NEXT: [[TMP27:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2
+// CHECK4-NEXT: [[TMP28:%.*]] = inttoptr i32 [[TMP22]] to i8*
+// CHECK4-NEXT: store i8* [[TMP28]], i8** [[TMP27]], align 4
+// CHECK4-NEXT: [[TMP29:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3
+// CHECK4-NEXT: [[TMP30:%.*]] = bitcast [10 x [10 x i32]]* [[TMP0]] to i8*
+// CHECK4-NEXT: store i8* [[TMP30]], i8** [[TMP29]], align 4
+// CHECK4-NEXT: [[TMP31:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK4-NEXT: [[TMP32:%.*]] = load i32, i32* [[TMP31]], align 4
+// CHECK4-NEXT: [[TMP33:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK4-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP32]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, i32, [10 x [10 x i32]]*)* @__omp_outlined__9 to i8*), i8* null, i8** [[TMP33]], i32 4)
+// CHECK4-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK4: omp.inner.for.inc:
+// CHECK4-NEXT: [[TMP34:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8
+// CHECK4-NEXT: [[TMP35:%.*]] = load i64, i64* [[DOTOMP_STRIDE]], align 8
+// CHECK4-NEXT: [[ADD14:%.*]] = add nsw i64 [[TMP34]], [[TMP35]]
+// CHECK4-NEXT: store i64 [[ADD14]], i64* [[DOTOMP_IV]], align 8
+// CHECK4-NEXT: [[TMP36:%.*]] = load i64, i64* [[DOTOMP_COMB_LB]], align 8
+// CHECK4-NEXT: [[TMP37:%.*]] = load i64, i64* [[DOTOMP_STRIDE]], align 8
+// CHECK4-NEXT: [[ADD15:%.*]] = add nsw i64 [[TMP36]], [[TMP37]]
+// CHECK4-NEXT: store i64 [[ADD15]], i64* [[DOTOMP_COMB_LB]], align 8
+// CHECK4-NEXT: [[TMP38:%.*]] = load i64, i64* [[DOTOMP_COMB_UB]], align 8
+// CHECK4-NEXT: [[TMP39:%.*]] = load i64, i64* [[DOTOMP_STRIDE]], align 8
+// CHECK4-NEXT: [[ADD16:%.*]] = add nsw i64 [[TMP38]], [[TMP39]]
+// CHECK4-NEXT: store i64 [[ADD16]], i64* [[DOTOMP_COMB_UB]], align 8
+// CHECK4-NEXT: [[TMP40:%.*]] = load i64, i64* [[DOTOMP_COMB_UB]], align 8
+// CHECK4-NEXT: [[TMP41:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8
+// CHECK4-NEXT: [[CMP17:%.*]] = icmp sgt i64 [[TMP40]], [[TMP41]]
+// CHECK4-NEXT: br i1 [[CMP17]], label [[COND_TRUE18:%.*]], label [[COND_FALSE19:%.*]]
+// CHECK4: cond.true18:
+// CHECK4-NEXT: [[TMP42:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8
+// CHECK4-NEXT: br label [[COND_END20:%.*]]
+// CHECK4: cond.false19:
+// CHECK4-NEXT: [[TMP43:%.*]] = load i64, i64* [[DOTOMP_COMB_UB]], align 8
+// CHECK4-NEXT: br label [[COND_END20]]
+// CHECK4: cond.end20:
+// CHECK4-NEXT: [[COND21:%.*]] = phi i64 [ [[TMP42]], [[COND_TRUE18]] ], [ [[TMP43]], [[COND_FALSE19]] ]
+// CHECK4-NEXT: store i64 [[COND21]], i64* [[DOTOMP_COMB_UB]], align 8
+// CHECK4-NEXT: [[TMP44:%.*]] = load i64, i64* [[DOTOMP_COMB_LB]], align 8
+// CHECK4-NEXT: store i64 [[TMP44]], i64* [[DOTOMP_IV]], align 8
+// CHECK4-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK4: omp.inner.for.end:
+// CHECK4-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK4: omp.loop.exit:
+// CHECK4-NEXT: [[TMP45:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK4-NEXT: [[TMP46:%.*]] = load i32, i32* [[TMP45]], align 4
+// CHECK4-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP46]])
+// CHECK4-NEXT: br label [[OMP_PRECOND_END]]
+// CHECK4: omp.precond.end:
+// CHECK4-NEXT: ret void
+//
+//
+// CHECK4-LABEL: define {{[^@]+}}@__omp_outlined__9
+// CHECK4-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32 [[N:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]]) #[[ATTR0]] {
+// CHECK4-NEXT: entry:
+// CHECK4-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK4-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK4-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4
+// CHECK4-NEXT: [[DOTOMP_IV:%.*]] = alloca i64, align 8
+// CHECK4-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[_TMP1:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[DOTCAPTURE_EXPR_3:%.*]] = alloca i64, align 8
+// CHECK4-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[J:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[DOTOMP_LB:%.*]] = alloca i64, align 8
+// CHECK4-NEXT: [[DOTOMP_UB:%.*]] = alloca i64, align 8
+// CHECK4-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i64, align 8
+// CHECK4-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[I11:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[J12:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK4-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK4-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK4-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK4-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4
+// CHECK4-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4
+// CHECK4-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4
+// CHECK4-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4
+// CHECK4-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK4-NEXT: [[TMP2:%.*]] = load i32, i32* [[N_ADDR]], align 4
+// CHECK4-NEXT: store i32 [[TMP2]], i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK4-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK4-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP3]], 0
+// CHECK4-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK4-NEXT: [[CONV:%.*]] = sext i32 [[DIV]] to i64
+// CHECK4-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK4-NEXT: [[SUB4:%.*]] = sub nsw i32 [[TMP4]], 0
+// CHECK4-NEXT: [[DIV5:%.*]] = sdiv i32 [[SUB4]], 1
+// CHECK4-NEXT: [[CONV6:%.*]] = sext i32 [[DIV5]] to i64
+// CHECK4-NEXT: [[MUL:%.*]] = mul nsw i64 [[CONV]], [[CONV6]]
+// CHECK4-NEXT: [[SUB7:%.*]] = sub nsw i64 [[MUL]], 1
+// CHECK4-NEXT: store i64 [[SUB7]], i64* [[DOTCAPTURE_EXPR_3]], align 8
+// CHECK4-NEXT: store i32 0, i32* [[I]], align 4
+// CHECK4-NEXT: store i32 0, i32* [[J]], align 4
+// CHECK4-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK4-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP5]]
+// CHECK4-NEXT: br i1 [[CMP]], label [[LAND_LHS_TRUE:%.*]], label [[OMP_PRECOND_END:%.*]]
+// CHECK4: land.lhs.true:
+// CHECK4-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK4-NEXT: [[CMP8:%.*]] = icmp slt i32 0, [[TMP6]]
+// CHECK4-NEXT: br i1 [[CMP8]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END]]
+// CHECK4: omp.precond.then:
+// CHECK4-NEXT: store i64 0, i64* [[DOTOMP_LB]], align 8
+// CHECK4-NEXT: [[TMP7:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8
+// CHECK4-NEXT: store i64 [[TMP7]], i64* [[DOTOMP_UB]], align 8
+// CHECK4-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK4-NEXT: [[CONV9:%.*]] = zext i32 [[TMP8]] to i64
+// CHECK4-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK4-NEXT: [[CONV10:%.*]] = zext i32 [[TMP9]] to i64
+// CHECK4-NEXT: store i64 [[CONV9]], i64* [[DOTOMP_LB]], align 8
+// CHECK4-NEXT: store i64 [[CONV10]], i64* [[DOTOMP_UB]], align 8
+// CHECK4-NEXT: store i64 1, i64* [[DOTOMP_STRIDE]], align 8
+// CHECK4-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK4-NEXT: [[TMP10:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK4-NEXT: [[TMP11:%.*]] = load i32, i32* [[TMP10]], align 4
+// CHECK4-NEXT: call void @__kmpc_for_static_init_8(%struct.ident_t* @[[GLOB2]], i32 [[TMP11]], i32 33,
i32* [[DOTOMP_IS_LAST]], i64* [[DOTOMP_LB]], i64* [[DOTOMP_UB]], i64* [[DOTOMP_STRIDE]], i64 1, i64 1) +// CHECK4-NEXT: [[TMP12:%.*]] = load i64, i64* [[DOTOMP_LB]], align 8 +// CHECK4-NEXT: store i64 [[TMP12]], i64* [[DOTOMP_IV]], align 8 +// CHECK4-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK4: omp.inner.for.cond: +// CHECK4-NEXT: [[TMP13:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8 +// CHECK4-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK4-NEXT: [[CONV13:%.*]] = zext i32 [[TMP14]] to i64 +// CHECK4-NEXT: [[CMP14:%.*]] = icmp sle i64 [[TMP13]], [[CONV13]] +// CHECK4-NEXT: br i1 [[CMP14]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK4: omp.inner.for.body: +// CHECK4-NEXT: [[TMP15:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8 +// CHECK4-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK4-NEXT: [[SUB15:%.*]] = sub nsw i32 [[TMP16]], 0 +// CHECK4-NEXT: [[DIV16:%.*]] = sdiv i32 [[SUB15]], 1 +// CHECK4-NEXT: [[MUL17:%.*]] = mul nsw i32 1, [[DIV16]] +// CHECK4-NEXT: [[CONV18:%.*]] = sext i32 [[MUL17]] to i64 +// CHECK4-NEXT: [[DIV19:%.*]] = sdiv i64 [[TMP15]], [[CONV18]] +// CHECK4-NEXT: [[MUL20:%.*]] = mul nsw i64 [[DIV19]], 1 +// CHECK4-NEXT: [[ADD:%.*]] = add nsw i64 0, [[MUL20]] +// CHECK4-NEXT: [[CONV21:%.*]] = trunc i64 [[ADD]] to i32 +// CHECK4-NEXT: store i32 [[CONV21]], i32* [[I11]], align 4 +// CHECK4-NEXT: [[TMP17:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8 +// CHECK4-NEXT: [[TMP18:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8 +// CHECK4-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK4-NEXT: [[SUB22:%.*]] = sub nsw i32 [[TMP19]], 0 +// CHECK4-NEXT: [[DIV23:%.*]] = sdiv i32 [[SUB22]], 1 +// CHECK4-NEXT: [[MUL24:%.*]] = mul nsw i32 1, [[DIV23]] +// CHECK4-NEXT: [[CONV25:%.*]] = sext i32 [[MUL24]] to i64 +// CHECK4-NEXT: [[DIV26:%.*]] = sdiv i64 [[TMP18]], [[CONV25]] +// CHECK4-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK4-NEXT: [[SUB27:%.*]] = sub nsw i32 [[TMP20]], 0 +// CHECK4-NEXT: [[DIV28:%.*]] = sdiv i32 [[SUB27]], 1 +// CHECK4-NEXT: [[MUL29:%.*]] = mul nsw i32 1, [[DIV28]] +// CHECK4-NEXT: [[CONV30:%.*]] = sext i32 [[MUL29]] to i64 +// CHECK4-NEXT: [[MUL31:%.*]] = mul nsw i64 [[DIV26]], [[CONV30]] +// CHECK4-NEXT: [[SUB32:%.*]] = sub nsw i64 [[TMP17]], [[MUL31]] +// CHECK4-NEXT: [[MUL33:%.*]] = mul nsw i64 [[SUB32]], 1 +// CHECK4-NEXT: [[ADD34:%.*]] = add nsw i64 0, [[MUL33]] +// CHECK4-NEXT: [[CONV35:%.*]] = trunc i64 [[ADD34]] to i32 +// CHECK4-NEXT: store i32 [[CONV35]], i32* [[J12]], align 4 +// CHECK4-NEXT: [[TMP21:%.*]] = load i32, i32* [[I11]], align 4 +// CHECK4-NEXT: [[TMP22:%.*]] = load i32, i32* [[J12]], align 4 +// CHECK4-NEXT: [[ADD36:%.*]] = add nsw i32 [[TMP21]], [[TMP22]] +// CHECK4-NEXT: [[TMP23:%.*]] = load i32, i32* [[I11]], align 4 +// CHECK4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* [[TMP0]], i32 0, i32 [[TMP23]] +// CHECK4-NEXT: [[TMP24:%.*]] = load i32, i32* [[J12]], align 4 +// CHECK4-NEXT: [[ARRAYIDX37:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[ARRAYIDX]], i32 0, i32 [[TMP24]] +// CHECK4-NEXT: store i32 [[ADD36]], i32* [[ARRAYIDX37]], align 4 +// CHECK4-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK4: omp.body.continue: +// CHECK4-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK4: omp.inner.for.inc: +// CHECK4-NEXT: [[TMP25:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8 +// CHECK4-NEXT: [[TMP26:%.*]] = load i64, i64* 
[[DOTOMP_STRIDE]], align 8 +// CHECK4-NEXT: [[ADD38:%.*]] = add nsw i64 [[TMP25]], [[TMP26]] +// CHECK4-NEXT: store i64 [[ADD38]], i64* [[DOTOMP_IV]], align 8 +// CHECK4-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK4: omp.inner.for.end: +// CHECK4-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK4: omp.loop.exit: +// CHECK4-NEXT: [[TMP27:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK4-NEXT: [[TMP28:%.*]] = load i32, i32* [[TMP27]], align 4 +// CHECK4-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP28]]) +// CHECK4-NEXT: br label [[OMP_PRECOND_END]] +// CHECK4: omp.precond.end: +// CHECK4-NEXT: ret void +// +// +// CHECK4-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l74 +// CHECK4-SAME: (i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32* [[V:%.*]]) #[[ATTR0]] { +// CHECK4-NEXT: entry: +// CHECK4-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4 +// CHECK4-NEXT: [[V_ADDR:%.*]] = alloca i32*, align 4 +// CHECK4-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK4-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK4-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4 +// CHECK4-NEXT: store i32* [[V]], i32** [[V_ADDR]], align 4 +// CHECK4-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4 +// CHECK4-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK4-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) +// CHECK4-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK4: .execute: +// CHECK4-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) +// CHECK4-NEXT: [[TMP2:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK4-NEXT: store i32 [[TMP2]], i32* [[N_CASTED]], align 4 +// CHECK4-NEXT: [[TMP3:%.*]] = load i32, i32* [[N_CASTED]], align 4 +// CHECK4-NEXT: [[TMP4:%.*]] = load i32*, i32** [[V_ADDR]], align 4 +// CHECK4-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK4-NEXT: call void @__omp_outlined__10(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32 [[TMP3]], [1000 x i32]* [[TMP0]], i32* [[TMP4]]) #[[ATTR2]] +// CHECK4-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK4: .omp.deinit: +// CHECK4-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) +// CHECK4-NEXT: br label [[DOTEXIT:%.*]] +// CHECK4: .exit: +// CHECK4-NEXT: ret void +// +// +// CHECK4-LABEL: define {{[^@]+}}@__omp_outlined__10 +// CHECK4-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32* [[V:%.*]]) #[[ATTR0]] { +// CHECK4-NEXT: entry: +// CHECK4-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK4-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK4-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4 +// CHECK4-NEXT: [[V_ADDR:%.*]] = alloca i32*, align 4 +// CHECK4-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[I:%.*]] = alloca i32, 
align 4 +// CHECK4-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[I3:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [5 x i8*], align 4 +// CHECK4-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK4-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK4-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK4-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4 +// CHECK4-NEXT: store i32* [[V]], i32** [[V_ADDR]], align 4 +// CHECK4-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4 +// CHECK4-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK4-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK4-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK4-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 +// CHECK4-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK4-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK4-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK4-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK4-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK4-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] +// CHECK4-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK4: omp.precond.then: +// CHECK4-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK4-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK4-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK4-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK4-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK4-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK4-NEXT: [[TMP5:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK4-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4 +// CHECK4-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP6]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) +// CHECK4-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK4-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK4-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP7]], [[TMP8]] +// CHECK4-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK4: cond.true: +// CHECK4-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK4-NEXT: br label [[COND_END:%.*]] +// CHECK4: cond.false: +// CHECK4-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK4-NEXT: br label [[COND_END]] +// CHECK4: cond.end: +// CHECK4-NEXT: [[COND:%.*]] = phi i32 [ [[TMP9]], [[COND_TRUE]] ], [ [[TMP10]], [[COND_FALSE]] ] +// CHECK4-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK4-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK4-NEXT: store i32 [[TMP11]], i32* [[DOTOMP_IV]], align 4 +// CHECK4-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK4: omp.inner.for.cond: +// CHECK4-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK4-NEXT: 
[[TMP13:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK4-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP13]], 1 +// CHECK4-NEXT: [[CMP5:%.*]] = icmp slt i32 [[TMP12]], [[ADD]] +// CHECK4-NEXT: br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK4: omp.inner.for.body: +// CHECK4-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK4-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK4-NEXT: [[TMP16:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK4-NEXT: store i32 [[TMP16]], i32* [[N_CASTED]], align 4 +// CHECK4-NEXT: [[TMP17:%.*]] = load i32, i32* [[N_CASTED]], align 4 +// CHECK4-NEXT: [[TMP18:%.*]] = load i32*, i32** [[V_ADDR]], align 4 +// CHECK4-NEXT: [[TMP19:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 +// CHECK4-NEXT: [[TMP20:%.*]] = inttoptr i32 [[TMP14]] to i8* +// CHECK4-NEXT: store i8* [[TMP20]], i8** [[TMP19]], align 4 +// CHECK4-NEXT: [[TMP21:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 +// CHECK4-NEXT: [[TMP22:%.*]] = inttoptr i32 [[TMP15]] to i8* +// CHECK4-NEXT: store i8* [[TMP22]], i8** [[TMP21]], align 4 +// CHECK4-NEXT: [[TMP23:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 +// CHECK4-NEXT: [[TMP24:%.*]] = inttoptr i32 [[TMP17]] to i8* +// CHECK4-NEXT: store i8* [[TMP24]], i8** [[TMP23]], align 4 +// CHECK4-NEXT: [[TMP25:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3 +// CHECK4-NEXT: [[TMP26:%.*]] = bitcast [1000 x i32]* [[TMP0]] to i8* +// CHECK4-NEXT: store i8* [[TMP26]], i8** [[TMP25]], align 4 +// CHECK4-NEXT: [[TMP27:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 4 +// CHECK4-NEXT: [[TMP28:%.*]] = bitcast i32* [[TMP18]] to i8* +// CHECK4-NEXT: store i8* [[TMP28]], i8** [[TMP27]], align 4 +// CHECK4-NEXT: [[TMP29:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK4-NEXT: [[TMP30:%.*]] = load i32, i32* [[TMP29]], align 4 +// CHECK4-NEXT: [[TMP31:%.*]] = bitcast [5 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK4-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP30]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, i32, [1000 x i32]*, i32*)* @__omp_outlined__11 to i8*), i8* null, i8** [[TMP31]], i32 5) +// CHECK4-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK4: omp.inner.for.inc: +// CHECK4-NEXT: [[TMP32:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK4-NEXT: [[TMP33:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK4-NEXT: [[ADD6:%.*]] = add nsw i32 [[TMP32]], [[TMP33]] +// CHECK4-NEXT: store i32 [[ADD6]], i32* [[DOTOMP_IV]], align 4 +// CHECK4-NEXT: [[TMP34:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK4-NEXT: [[TMP35:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK4-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP34]], [[TMP35]] +// CHECK4-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK4-NEXT: [[TMP36:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK4-NEXT: [[TMP37:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK4-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP36]], [[TMP37]] +// CHECK4-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK4-NEXT: [[TMP38:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK4-NEXT: [[TMP39:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK4-NEXT: [[CMP9:%.*]] = icmp 
sgt i32 [[TMP38]], [[TMP39]] +// CHECK4-NEXT: br i1 [[CMP9]], label [[COND_TRUE10:%.*]], label [[COND_FALSE11:%.*]] +// CHECK4: cond.true10: +// CHECK4-NEXT: [[TMP40:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK4-NEXT: br label [[COND_END12:%.*]] +// CHECK4: cond.false11: +// CHECK4-NEXT: [[TMP41:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK4-NEXT: br label [[COND_END12]] +// CHECK4: cond.end12: +// CHECK4-NEXT: [[COND13:%.*]] = phi i32 [ [[TMP40]], [[COND_TRUE10]] ], [ [[TMP41]], [[COND_FALSE11]] ] +// CHECK4-NEXT: store i32 [[COND13]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK4-NEXT: [[TMP42:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK4-NEXT: store i32 [[TMP42]], i32* [[DOTOMP_IV]], align 4 +// CHECK4-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK4: omp.inner.for.end: +// CHECK4-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK4: omp.loop.exit: +// CHECK4-NEXT: [[TMP43:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK4-NEXT: [[TMP44:%.*]] = load i32, i32* [[TMP43]], align 4 +// CHECK4-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP44]]) +// CHECK4-NEXT: br label [[OMP_PRECOND_END]] +// CHECK4: omp.precond.end: +// CHECK4-NEXT: ret void +// +// +// CHECK4-LABEL: define {{[^@]+}}@__omp_outlined__11 +// CHECK4-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32* [[V:%.*]]) #[[ATTR0]] { +// CHECK4-NEXT: entry: +// CHECK4-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK4-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK4-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4 +// CHECK4-NEXT: [[V_ADDR:%.*]] = alloca i32*, align 4 +// CHECK4-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[I3:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK4-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK4-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK4-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK4-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK4-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4 +// CHECK4-NEXT: store i32* [[V]], i32** [[V_ADDR]], align 4 +// CHECK4-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4 +// CHECK4-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK4-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK4-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK4-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 +// CHECK4-NEXT: [[DIV:%.*]] = sdiv i32 
[[SUB]], 1 +// CHECK4-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK4-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK4-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK4-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK4-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] +// CHECK4-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK4: omp.precond.then: +// CHECK4-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 +// CHECK4-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK4-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_UB]], align 4 +// CHECK4-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK4-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK4-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_LB]], align 4 +// CHECK4-NEXT: store i32 [[TMP6]], i32* [[DOTOMP_UB]], align 4 +// CHECK4-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK4-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK4-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK4-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4 +// CHECK4-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP8]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) +// CHECK4-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK4-NEXT: store i32 [[TMP9]], i32* [[DOTOMP_IV]], align 4 +// CHECK4-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK4: omp.inner.for.cond: +// CHECK4-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK4-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK4-NEXT: [[CMP4:%.*]] = icmp ule i32 [[TMP10]], [[TMP11]] +// CHECK4-NEXT: br i1 [[CMP4]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK4: omp.inner.for.body: +// CHECK4-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK4-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP12]], 1 +// CHECK4-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK4-NEXT: store i32 [[ADD]], i32* [[I3]], align 4 +// CHECK4-NEXT: [[TMP13:%.*]] = load i32*, i32** [[V_ADDR]], align 4 +// CHECK4-NEXT: [[TMP14:%.*]] = load i32, i32* [[I3]], align 4 +// CHECK4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TMP13]], i32 [[TMP14]] +// CHECK4-NEXT: [[TMP15:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +// CHECK4-NEXT: [[TMP16:%.*]] = load i32, i32* [[I3]], align 4 +// CHECK4-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds [1000 x i32], [1000 x i32]* [[TMP0]], i32 0, i32 [[TMP16]] +// CHECK4-NEXT: store i32 [[TMP15]], i32* [[ARRAYIDX5]], align 4 +// CHECK4-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK4: omp.body.continue: +// CHECK4-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK4: omp.inner.for.inc: +// CHECK4-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK4-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK4-NEXT: [[ADD6:%.*]] = add nsw i32 [[TMP17]], [[TMP18]] +// CHECK4-NEXT: store i32 [[ADD6]], i32* [[DOTOMP_IV]], align 4 +// CHECK4-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK4: omp.inner.for.end: +// CHECK4-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK4: omp.loop.exit: +// CHECK4-NEXT: [[TMP19:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK4-NEXT: [[TMP20:%.*]] = load i32, i32* [[TMP19]], align 4 +// CHECK4-NEXT: call void 
@__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP20]]) +// CHECK4-NEXT: br label [[OMP_PRECOND_END]] +// CHECK4: omp.precond.end: +// CHECK4-NEXT: ret void +// +// +// CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l43 +// CHECK5-SAME: (i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32 [[L:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK5-NEXT: entry: +// CHECK5-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4 +// CHECK5-NEXT: [[L_ADDR:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[L_CASTED:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK5-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK5-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4 +// CHECK5-NEXT: store i32 [[L]], i32* [[L_ADDR]], align 4 +// CHECK5-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4 +// CHECK5-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK5-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) +// CHECK5-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK5: .execute: +// CHECK5-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3:[0-9]+]]) +// CHECK5-NEXT: [[TMP2:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK5-NEXT: store i32 [[TMP2]], i32* [[N_CASTED]], align 4 +// CHECK5-NEXT: [[TMP3:%.*]] = load i32, i32* [[N_CASTED]], align 4 +// CHECK5-NEXT: [[TMP4:%.*]] = load i32, i32* [[L_ADDR]], align 4 +// CHECK5-NEXT: store i32 [[TMP4]], i32* [[L_CASTED]], align 4 +// CHECK5-NEXT: [[TMP5:%.*]] = load i32, i32* [[L_CASTED]], align 4 +// CHECK5-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK5-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32 [[TMP3]], [1000 x i32]* [[TMP0]], i32 [[TMP5]]) #[[ATTR2:[0-9]+]] +// CHECK5-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK5: .omp.deinit: +// CHECK5-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) +// CHECK5-NEXT: br label [[DOTEXIT:%.*]] +// CHECK5: .exit: +// CHECK5-NEXT: ret void +// +// +// CHECK5-LABEL: define {{[^@]+}}@__omp_outlined__ +// CHECK5-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32 [[L:%.*]]) #[[ATTR0]] { +// CHECK5-NEXT: entry: +// CHECK5-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK5-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK5-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4 +// CHECK5-NEXT: [[L_ADDR:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[I4:%.*]] = alloca i32, 
align 4 +// CHECK5-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[L_CASTED:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [5 x i8*], align 4 +// CHECK5-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK5-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK5-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK5-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4 +// CHECK5-NEXT: store i32 [[L]], i32* [[L_ADDR]], align 4 +// CHECK5-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4 +// CHECK5-NEXT: [[TMP1:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared", align 2 +// CHECK5-NEXT: [[TMP2:%.*]] = load i32, i32* @"_openmp_static_kernel$size", align 4 +// CHECK5-NEXT: call void @__kmpc_get_team_static_memory(i16 1, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds (%"union._shared_openmp_static_memory_type_$_", %"union._shared_openmp_static_memory_type_$_" addrspace(3)* @"_openmp_shared_static_glob_rd_$_", i32 0, i32 0, i32 0) to i8*), i32 [[TMP2]], i16 [[TMP1]], i8** addrspacecast (i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr" to i8**)) +// CHECK5-NEXT: [[TMP3:%.*]] = load i8*, i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr", align 4 +// CHECK5-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, i8* [[TMP3]], i32 0 +// CHECK5-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to %struct._globalized_locals_ty* +// CHECK5-NEXT: [[L1:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP5]], i32 0, i32 0 +// CHECK5-NEXT: [[TMP6:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK5-NEXT: store i32 [[TMP6]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK5-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK5-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP7]], 0 +// CHECK5-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK5-NEXT: [[SUB3:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK5-NEXT: store i32 [[SUB3]], i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK5-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK5-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK5-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP8]] +// CHECK5-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK5: omp.precond.then: +// CHECK5-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK5-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK5-NEXT: store i32 [[TMP9]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK5-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK5-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK5-NEXT: [[TMP10:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK5-NEXT: [[TMP11:%.*]] = load i32, i32* [[TMP10]], align 4 +// CHECK5-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1:[0-9]+]], i32 [[TMP11]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 128) +// CHECK5-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK5-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK5-NEXT: [[CMP5:%.*]] = icmp sgt i32 [[TMP12]], [[TMP13]] +// CHECK5-NEXT: br i1 [[CMP5]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK5: cond.true: +// CHECK5-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK5-NEXT: br 
label [[COND_END:%.*]] +// CHECK5: cond.false: +// CHECK5-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK5-NEXT: br label [[COND_END]] +// CHECK5: cond.end: +// CHECK5-NEXT: [[COND:%.*]] = phi i32 [ [[TMP14]], [[COND_TRUE]] ], [ [[TMP15]], [[COND_FALSE]] ] +// CHECK5-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK5-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK5-NEXT: store i32 [[TMP16]], i32* [[DOTOMP_IV]], align 4 +// CHECK5-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK5: omp.inner.for.cond: +// CHECK5-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK5-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK5-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP18]], 1 +// CHECK5-NEXT: [[CMP6:%.*]] = icmp slt i32 [[TMP17]], [[ADD]] +// CHECK5-NEXT: br i1 [[CMP6]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK5: omp.inner.for.body: +// CHECK5-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK5-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK5-NEXT: [[TMP21:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK5-NEXT: store i32 [[TMP21]], i32* [[N_CASTED]], align 4 +// CHECK5-NEXT: [[TMP22:%.*]] = load i32, i32* [[N_CASTED]], align 4 +// CHECK5-NEXT: [[TMP23:%.*]] = load i32, i32* [[L_ADDR]], align 4 +// CHECK5-NEXT: store i32 [[TMP23]], i32* [[L_CASTED]], align 4 +// CHECK5-NEXT: [[TMP24:%.*]] = load i32, i32* [[L_CASTED]], align 4 +// CHECK5-NEXT: [[TMP25:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 +// CHECK5-NEXT: [[TMP26:%.*]] = inttoptr i32 [[TMP19]] to i8* +// CHECK5-NEXT: store i8* [[TMP26]], i8** [[TMP25]], align 4 +// CHECK5-NEXT: [[TMP27:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 +// CHECK5-NEXT: [[TMP28:%.*]] = inttoptr i32 [[TMP20]] to i8* +// CHECK5-NEXT: store i8* [[TMP28]], i8** [[TMP27]], align 4 +// CHECK5-NEXT: [[TMP29:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 +// CHECK5-NEXT: [[TMP30:%.*]] = inttoptr i32 [[TMP22]] to i8* +// CHECK5-NEXT: store i8* [[TMP30]], i8** [[TMP29]], align 4 +// CHECK5-NEXT: [[TMP31:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3 +// CHECK5-NEXT: [[TMP32:%.*]] = bitcast [1000 x i32]* [[TMP0]] to i8* +// CHECK5-NEXT: store i8* [[TMP32]], i8** [[TMP31]], align 4 +// CHECK5-NEXT: [[TMP33:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 4 +// CHECK5-NEXT: [[TMP34:%.*]] = inttoptr i32 [[TMP24]] to i8* +// CHECK5-NEXT: store i8* [[TMP34]], i8** [[TMP33]], align 4 +// CHECK5-NEXT: [[TMP35:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK5-NEXT: [[TMP36:%.*]] = load i32, i32* [[TMP35]], align 4 +// CHECK5-NEXT: [[TMP37:%.*]] = bitcast [5 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK5-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP36]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, i32, [1000 x i32]*, i32)* @__omp_outlined__1 to i8*), i8* null, i8** [[TMP37]], i32 5) +// CHECK5-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK5: omp.inner.for.inc: +// CHECK5-NEXT: [[TMP38:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK5-NEXT: [[TMP39:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK5-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP38]], [[TMP39]] +// CHECK5-NEXT: store i32 
[[ADD7]], i32* [[DOTOMP_IV]], align 4 +// CHECK5-NEXT: [[TMP40:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK5-NEXT: [[TMP41:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK5-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP40]], [[TMP41]] +// CHECK5-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK5-NEXT: [[TMP42:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK5-NEXT: [[TMP43:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK5-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP42]], [[TMP43]] +// CHECK5-NEXT: store i32 [[ADD9]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK5-NEXT: [[TMP44:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK5-NEXT: [[TMP45:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK5-NEXT: [[CMP10:%.*]] = icmp sgt i32 [[TMP44]], [[TMP45]] +// CHECK5-NEXT: br i1 [[CMP10]], label [[COND_TRUE11:%.*]], label [[COND_FALSE12:%.*]] +// CHECK5: cond.true11: +// CHECK5-NEXT: [[TMP46:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK5-NEXT: br label [[COND_END13:%.*]] +// CHECK5: cond.false12: +// CHECK5-NEXT: [[TMP47:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK5-NEXT: br label [[COND_END13]] +// CHECK5: cond.end13: +// CHECK5-NEXT: [[COND14:%.*]] = phi i32 [ [[TMP46]], [[COND_TRUE11]] ], [ [[TMP47]], [[COND_FALSE12]] ] +// CHECK5-NEXT: store i32 [[COND14]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK5-NEXT: [[TMP48:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK5-NEXT: store i32 [[TMP48]], i32* [[DOTOMP_IV]], align 4 +// CHECK5-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK5: omp.inner.for.end: +// CHECK5-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK5: omp.loop.exit: +// CHECK5-NEXT: [[TMP49:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK5-NEXT: [[TMP50:%.*]] = load i32, i32* [[TMP49]], align 4 +// CHECK5-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP50]]) +// CHECK5-NEXT: [[TMP51:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK5-NEXT: [[TMP52:%.*]] = icmp ne i32 [[TMP51]], 0 +// CHECK5-NEXT: br i1 [[TMP52]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]] +// CHECK5: .omp.lastprivate.then: +// CHECK5-NEXT: [[TMP53:%.*]] = load i32, i32* [[L_ADDR]], align 4 +// CHECK5-NEXT: store i32 [[TMP53]], i32* [[L_ADDR]], align 4 +// CHECK5-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]] +// CHECK5: .omp.lastprivate.done: +// CHECK5-NEXT: br label [[OMP_PRECOND_END]] +// CHECK5: omp.precond.end: +// CHECK5-NEXT: [[TMP54:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared", align 2 +// CHECK5-NEXT: call void @__kmpc_restore_team_static_memory(i16 1, i16 [[TMP54]]) +// CHECK5-NEXT: ret void +// +// +// CHECK5-LABEL: define {{[^@]+}}@__omp_outlined__1 +// CHECK5-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32 [[L:%.*]]) #[[ATTR0]] { +// CHECK5-NEXT: entry: +// CHECK5-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK5-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK5-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4 +// CHECK5-NEXT: [[L_ADDR:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTOMP_IV:%.*]] = 
alloca i32, align 4 +// CHECK5-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[I3:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK5-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK5-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK5-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK5-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK5-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4 +// CHECK5-NEXT: store i32 [[L]], i32* [[L_ADDR]], align 4 +// CHECK5-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4 +// CHECK5-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK5-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK5-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK5-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 +// CHECK5-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK5-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK5-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK5-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK5-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK5-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] +// CHECK5-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK5: omp.precond.then: +// CHECK5-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 +// CHECK5-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK5-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_UB]], align 4 +// CHECK5-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK5-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK5-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_LB]], align 4 +// CHECK5-NEXT: store i32 [[TMP6]], i32* [[DOTOMP_UB]], align 4 +// CHECK5-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK5-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK5-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK5-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4 +// CHECK5-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP8]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 32) +// CHECK5-NEXT: br label [[OMP_DISPATCH_COND:%.*]] +// CHECK5: omp.dispatch.cond: +// CHECK5-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK5-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK5-NEXT: [[CMP4:%.*]] = icmp ugt i32 [[TMP9]], [[TMP10]] +// CHECK5-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK5: cond.true: +// CHECK5-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK5-NEXT: br label [[COND_END:%.*]] +// CHECK5: cond.false: +// CHECK5-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK5-NEXT: br label 
[[COND_END]] +// CHECK5: cond.end: +// CHECK5-NEXT: [[COND:%.*]] = phi i32 [ [[TMP11]], [[COND_TRUE]] ], [ [[TMP12]], [[COND_FALSE]] ] +// CHECK5-NEXT: store i32 [[COND]], i32* [[DOTOMP_UB]], align 4 +// CHECK5-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK5-NEXT: store i32 [[TMP13]], i32* [[DOTOMP_IV]], align 4 +// CHECK5-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK5-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK5-NEXT: [[CMP5:%.*]] = icmp sle i32 [[TMP14]], [[TMP15]] +// CHECK5-NEXT: br i1 [[CMP5]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]] +// CHECK5: omp.dispatch.body: +// CHECK5-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK5: omp.inner.for.cond: +// CHECK5-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK5-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK5-NEXT: [[CMP6:%.*]] = icmp sle i32 [[TMP16]], [[TMP17]] +// CHECK5-NEXT: br i1 [[CMP6]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK5: omp.inner.for.body: +// CHECK5-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK5-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP18]], 1 +// CHECK5-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK5-NEXT: store i32 [[ADD]], i32* [[I3]], align 4 +// CHECK5-NEXT: [[TMP19:%.*]] = load i32, i32* [[I3]], align 4 +// CHECK5-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1000 x i32], [1000 x i32]* [[TMP0]], i32 0, i32 [[TMP19]] +// CHECK5-NEXT: store i32 1, i32* [[ARRAYIDX]], align 4 +// CHECK5-NEXT: [[TMP20:%.*]] = load i32, i32* [[I3]], align 4 +// CHECK5-NEXT: store i32 [[TMP20]], i32* [[L_ADDR]], align 4 +// CHECK5-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK5: omp.body.continue: +// CHECK5-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK5: omp.inner.for.inc: +// CHECK5-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK5-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP21]], 1 +// CHECK5-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_IV]], align 4 +// CHECK5-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK5: omp.inner.for.end: +// CHECK5-NEXT: br label [[OMP_DISPATCH_INC:%.*]] +// CHECK5: omp.dispatch.inc: +// CHECK5-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK5-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK5-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP22]], [[TMP23]] +// CHECK5-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_LB]], align 4 +// CHECK5-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK5-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK5-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP24]], [[TMP25]] +// CHECK5-NEXT: store i32 [[ADD9]], i32* [[DOTOMP_UB]], align 4 +// CHECK5-NEXT: br label [[OMP_DISPATCH_COND]] +// CHECK5: omp.dispatch.end: +// CHECK5-NEXT: [[TMP26:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK5-NEXT: [[TMP27:%.*]] = load i32, i32* [[TMP26]], align 4 +// CHECK5-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP27]]) +// CHECK5-NEXT: [[TMP28:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK5-NEXT: [[TMP29:%.*]] = icmp ne i32 [[TMP28]], 0 +// CHECK5-NEXT: br i1 [[TMP29]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]] +// CHECK5: .omp.lastprivate.then: +// CHECK5-NEXT: [[TMP30:%.*]] = load i32, i32* [[L_ADDR]], align 4 +// CHECK5-NEXT: store i32 [[TMP30]], i32* [[L_ADDR]], align 4 +// CHECK5-NEXT: br label 
[[DOTOMP_LASTPRIVATE_DONE]] +// CHECK5: .omp.lastprivate.done: +// CHECK5-NEXT: br label [[OMP_PRECOND_END]] +// CHECK5: omp.precond.end: +// CHECK5-NEXT: ret void +// +// +// CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l49 +// CHECK5-SAME: (i32 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { +// CHECK5-NEXT: entry: +// CHECK5-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 4 +// CHECK5-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK5-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK5-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 4 +// CHECK5-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 4 +// CHECK5-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK5-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) +// CHECK5-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK5: .execute: +// CHECK5-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) +// CHECK5-NEXT: [[TMP2:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK5-NEXT: store i32 [[TMP2]], i32* [[N_CASTED]], align 4 +// CHECK5-NEXT: [[TMP3:%.*]] = load i32, i32* [[N_CASTED]], align 4 +// CHECK5-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK5-NEXT: call void @__omp_outlined__2(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32 [[TMP3]], [1000 x i16]* [[TMP0]]) #[[ATTR2]] +// CHECK5-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK5: .omp.deinit: +// CHECK5-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) +// CHECK5-NEXT: br label [[DOTEXIT:%.*]] +// CHECK5: .exit: +// CHECK5-NEXT: ret void +// +// +// CHECK5-LABEL: define {{[^@]+}}@__omp_outlined__2 +// CHECK5-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { +// CHECK5-NEXT: entry: +// CHECK5-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK5-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK5-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 4 +// CHECK5-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[I3:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x i8*], align 4 +// CHECK5-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK5-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK5-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK5-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 4 +// CHECK5-NEXT: [[TMP0:%.*]] = 
load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 4
+// CHECK5-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4
+// CHECK5-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK5-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK5-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0
+// CHECK5-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK5-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1
+// CHECK5-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK5-NEXT: store i32 0, i32* [[I]], align 4
+// CHECK5-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK5-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]]
+// CHECK5-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]]
+// CHECK5: omp.precond.then:
+// CHECK5-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK5-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK5-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK5-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK5-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK5-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK5-NEXT: [[TMP5:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK5-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4
+// CHECK5-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP6]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]])
+// CHECK5-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK5-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK5-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP7]], [[TMP8]]
+// CHECK5-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK5: cond.true:
+// CHECK5-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK5-NEXT: br label [[COND_END:%.*]]
+// CHECK5: cond.false:
+// CHECK5-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK5-NEXT: br label [[COND_END]]
+// CHECK5: cond.end:
+// CHECK5-NEXT: [[COND:%.*]] = phi i32 [ [[TMP9]], [[COND_TRUE]] ], [ [[TMP10]], [[COND_FALSE]] ]
+// CHECK5-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK5-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK5-NEXT: store i32 [[TMP11]], i32* [[DOTOMP_IV]], align 4
+// CHECK5-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK5: omp.inner.for.cond:
+// CHECK5-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK5-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK5-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP13]], 1
+// CHECK5-NEXT: [[CMP5:%.*]] = icmp slt i32 [[TMP12]], [[ADD]]
+// CHECK5-NEXT: br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK5: omp.inner.for.body:
+// CHECK5-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK5-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK5-NEXT: [[TMP16:%.*]] = load i32, i32* [[N_ADDR]], align 4
+// CHECK5-NEXT: store i32 [[TMP16]], i32* [[N_CASTED]], align 4
+// CHECK5-NEXT: [[TMP17:%.*]] = load i32, i32* [[N_CASTED]], align 4
+// CHECK5-NEXT: [[TMP18:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0
+// CHECK5-NEXT: [[TMP19:%.*]] = inttoptr i32 [[TMP14]] to i8*
+// CHECK5-NEXT: store i8* [[TMP19]], i8** [[TMP18]], align 4
+// CHECK5-NEXT: [[TMP20:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1
+// CHECK5-NEXT: [[TMP21:%.*]] = inttoptr i32 [[TMP15]] to i8*
+// CHECK5-NEXT: store i8* [[TMP21]], i8** [[TMP20]], align 4
+// CHECK5-NEXT: [[TMP22:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2
+// CHECK5-NEXT: [[TMP23:%.*]] = inttoptr i32 [[TMP17]] to i8*
+// CHECK5-NEXT: store i8* [[TMP23]], i8** [[TMP22]], align 4
+// CHECK5-NEXT: [[TMP24:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3
+// CHECK5-NEXT: [[TMP25:%.*]] = bitcast [1000 x i16]* [[TMP0]] to i8*
+// CHECK5-NEXT: store i8* [[TMP25]], i8** [[TMP24]], align 4
+// CHECK5-NEXT: [[TMP26:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK5-NEXT: [[TMP27:%.*]] = load i32, i32* [[TMP26]], align 4
+// CHECK5-NEXT: [[TMP28:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK5-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP27]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, i32, [1000 x i16]*)* @__omp_outlined__3 to i8*), i8* null, i8** [[TMP28]], i32 4)
+// CHECK5-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK5: omp.inner.for.inc:
+// CHECK5-NEXT: [[TMP29:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK5-NEXT: [[TMP30:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK5-NEXT: [[ADD6:%.*]] = add nsw i32 [[TMP29]], [[TMP30]]
+// CHECK5-NEXT: store i32 [[ADD6]], i32* [[DOTOMP_IV]], align 4
+// CHECK5-NEXT: [[TMP31:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK5-NEXT: [[TMP32:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK5-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP31]], [[TMP32]]
+// CHECK5-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK5-NEXT: [[TMP33:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK5-NEXT: [[TMP34:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK5-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP33]], [[TMP34]]
+// CHECK5-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK5-NEXT: [[TMP35:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK5-NEXT: [[TMP36:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK5-NEXT: [[CMP9:%.*]] = icmp sgt i32 [[TMP35]], [[TMP36]]
+// CHECK5-NEXT: br i1 [[CMP9]], label [[COND_TRUE10:%.*]], label [[COND_FALSE11:%.*]]
+// CHECK5: cond.true10:
+// CHECK5-NEXT: [[TMP37:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK5-NEXT: br label [[COND_END12:%.*]]
+// CHECK5: cond.false11:
+// CHECK5-NEXT: [[TMP38:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK5-NEXT: br label [[COND_END12]]
+// CHECK5: cond.end12:
+// CHECK5-NEXT: [[COND13:%.*]] = phi i32 [ [[TMP37]], [[COND_TRUE10]] ], [ [[TMP38]], [[COND_FALSE11]] ]
+// CHECK5-NEXT: store i32 [[COND13]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK5-NEXT: [[TMP39:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK5-NEXT: store i32 [[TMP39]], i32* [[DOTOMP_IV]], align 4
+// CHECK5-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK5: omp.inner.for.end:
+// CHECK5-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK5: omp.loop.exit:
+// CHECK5-NEXT: [[TMP40:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK5-NEXT: [[TMP41:%.*]] = load i32, i32* [[TMP40]], align 4
+// CHECK5-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP41]])
+// CHECK5-NEXT: br label [[OMP_PRECOND_END]]
+// CHECK5: omp.precond.end:
+// CHECK5-NEXT: ret void
+//
+//
+// CHECK5-LABEL: define {{[^@]+}}@__omp_outlined__3
+// CHECK5-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] {
+// CHECK5-NEXT: entry:
+// CHECK5-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK5-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK5-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 4
+// CHECK5-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[I3:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK5-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK5-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK5-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK5-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4
+// CHECK5-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 4
+// CHECK5-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 4
+// CHECK5-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4
+// CHECK5-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK5-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK5-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0
+// CHECK5-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK5-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1
+// CHECK5-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK5-NEXT: store i32 0, i32* [[I]], align 4
+// CHECK5-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK5-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]]
+// CHECK5-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]]
+// CHECK5: omp.precond.then:
+// CHECK5-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK5-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK5-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_UB]], align 4
+// CHECK5-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK5-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK5-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_LB]], align 4
+// CHECK5-NEXT: store i32 [[TMP6]], i32* [[DOTOMP_UB]], align 4
+// CHECK5-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK5-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK5-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK5-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4
+// CHECK5-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP8]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK5-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK5-NEXT: store i32 [[TMP9]], i32* [[DOTOMP_IV]], align 4
+// CHECK5-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK5: omp.inner.for.cond:
+// CHECK5-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK5-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK5-NEXT: [[CMP4:%.*]] = icmp ule i32 [[TMP10]], [[TMP11]]
+// CHECK5-NEXT: br i1 [[CMP4]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK5: omp.inner.for.body:
+// CHECK5-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK5-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP12]], 1
+// CHECK5-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK5-NEXT: store i32 [[ADD]], i32* [[I3]], align 4
+// CHECK5-NEXT: [[TMP13:%.*]] = load i32, i32* [[I3]], align 4
+// CHECK5-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1000 x i16], [1000 x i16]* [[TMP0]], i32 0, i32 [[TMP13]]
+// CHECK5-NEXT: [[TMP14:%.*]] = load i16, i16* [[ARRAYIDX]], align 2
+// CHECK5-NEXT: [[CONV:%.*]] = sext i16 [[TMP14]] to i32
+// CHECK5-NEXT: [[ADD5:%.*]] = add nsw i32 [[CONV]], 1
+// CHECK5-NEXT: [[CONV6:%.*]] = trunc i32 [[ADD5]] to i16
+// CHECK5-NEXT: store i16 [[CONV6]], i16* [[ARRAYIDX]], align 2
+// CHECK5-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK5: omp.body.continue:
+// CHECK5-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK5: omp.inner.for.inc:
+// CHECK5-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK5-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK5-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP15]], [[TMP16]]
+// CHECK5-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_IV]], align 4
+// CHECK5-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK5: omp.inner.for.end:
+// CHECK5-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK5: omp.loop.exit:
+// CHECK5-NEXT: [[TMP17:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK5-NEXT: [[TMP18:%.*]] = load i32, i32* [[TMP17]], align 4
+// CHECK5-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP18]])
+// CHECK5-NEXT: br label [[OMP_PRECOND_END]]
+// CHECK5: omp.precond.end:
+// CHECK5-NEXT: ret void
+//
+//
+// CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l54
+// CHECK5-SAME: ([10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] {
+// CHECK5-NEXT: entry:
+// CHECK5-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4
+// CHECK5-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK5-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4
+// CHECK5-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4
+// CHECK5-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK5-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0)
+// CHECK5-NEXT: br label [[DOTEXECUTE:%.*]]
+// CHECK5: .execute:
+// CHECK5-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]])
+// CHECK5-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK5-NEXT: call void @__omp_outlined__4(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x i32]* [[TMP0]]) #[[ATTR2]]
+// CHECK5-NEXT: br label [[DOTOMP_DEINIT:%.*]]
+// CHECK5: .omp.deinit:
+// CHECK5-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0)
+// CHECK5-NEXT: br label [[DOTEXIT:%.*]]
+// CHECK5: .exit:
+// CHECK5-NEXT: ret void
+//
+//
+// CHECK5-LABEL: define {{[^@]+}}@__omp_outlined__4
+// CHECK5-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] {
+// CHECK5-NEXT: entry:
+// CHECK5-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK5-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK5-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4
+// CHECK5-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [3 x i8*], align 4
+// CHECK5-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK5-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK5-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4
+// CHECK5-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4
+// CHECK5-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK5-NEXT: store i32 9, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK5-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK5-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK5-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK5-NEXT: [[TMP1:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK5-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4
+// CHECK5-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]])
+// CHECK5-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK5-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP3]], 9
+// CHECK5-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK5: cond.true:
+// CHECK5-NEXT: br label [[COND_END:%.*]]
+// CHECK5: cond.false:
+// CHECK5-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK5-NEXT: br label [[COND_END]]
+// CHECK5: cond.end:
+// CHECK5-NEXT: [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP4]], [[COND_FALSE]] ]
+// CHECK5-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK5-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK5-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4
+// CHECK5-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK5: omp.inner.for.cond:
+// CHECK5-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK5-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP6]], 10
+// CHECK5-NEXT: br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK5: omp.inner.for.body:
+// CHECK5-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK5-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK5-NEXT: [[TMP9:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0
+// CHECK5-NEXT: [[TMP10:%.*]] = inttoptr i32 [[TMP7]] to i8*
+// CHECK5-NEXT: store i8* [[TMP10]], i8** [[TMP9]], align 4
+// CHECK5-NEXT: [[TMP11:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1
+// CHECK5-NEXT: [[TMP12:%.*]] = inttoptr i32 [[TMP8]] to i8*
+// CHECK5-NEXT: store i8* [[TMP12]], i8** [[TMP11]], align 4
+// CHECK5-NEXT: [[TMP13:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2
+// CHECK5-NEXT: [[TMP14:%.*]] = bitcast [10 x i32]* [[TMP0]] to i8*
+// CHECK5-NEXT: store i8* [[TMP14]], i8** [[TMP13]], align 4
+// CHECK5-NEXT: [[TMP15:%.*]] = bitcast [3 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK5-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, [10 x i32]*)* @__omp_outlined__5 to i8*), i8* null, i8** [[TMP15]], i32 3)
+// CHECK5-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK5: omp.inner.for.inc:
+// CHECK5-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK5-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK5-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP16]], [[TMP17]]
+// CHECK5-NEXT: store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4
+// CHECK5-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK5-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK5-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP18]], [[TMP19]]
+// CHECK5-NEXT: store i32 [[ADD2]], i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK5-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK5-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK5-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP20]], [[TMP21]]
+// CHECK5-NEXT: store i32 [[ADD3]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK5-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK5-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP22]], 9
+// CHECK5-NEXT: br i1 [[CMP4]], label [[COND_TRUE5:%.*]], label [[COND_FALSE6:%.*]]
+// CHECK5: cond.true5:
+// CHECK5-NEXT: br label [[COND_END7:%.*]]
+// CHECK5: cond.false6:
+// CHECK5-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK5-NEXT: br label [[COND_END7]]
+// CHECK5: cond.end7:
+// CHECK5-NEXT: [[COND8:%.*]] = phi i32 [ 9, [[COND_TRUE5]] ], [ [[TMP23]], [[COND_FALSE6]] ]
+// CHECK5-NEXT: store i32 [[COND8]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK5-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK5-NEXT: store i32 [[TMP24]], i32* [[DOTOMP_IV]], align 4
+// CHECK5-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK5: omp.inner.for.end:
+// CHECK5-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK5: omp.loop.exit:
+// CHECK5-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]])
+// CHECK5-NEXT: ret void
+//
+//
+// CHECK5-LABEL: define {{[^@]+}}@__omp_outlined__5
+// CHECK5-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] {
+// CHECK5-NEXT: entry:
+// CHECK5-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK5-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK5-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4
+// CHECK5-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK5-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK5-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK5-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK5-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4
+// CHECK5-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4
+// CHECK5-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK5-NEXT: store i32 9, i32* [[DOTOMP_UB]], align 4
+// CHECK5-NEXT: [[TMP1:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK5-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK5-NEXT: store i32 [[TMP1]], i32* [[DOTOMP_LB]], align 4
+// CHECK5-NEXT: store i32 [[TMP2]], i32* [[DOTOMP_UB]], align 4
+// CHECK5-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK5-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK5-NEXT: [[TMP3:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK5-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4
+// CHECK5-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP4]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK5-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK5-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4
+// CHECK5-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK5: omp.inner.for.cond:
+// CHECK5-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK5-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK5-NEXT: [[CMP:%.*]] = icmp ule i32 [[TMP6]], [[TMP7]]
+// CHECK5-NEXT: br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK5: omp.inner.for.body:
+// CHECK5-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK5-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP8]], 1
+// CHECK5-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK5-NEXT: store i32 [[ADD]], i32* [[I]], align 4
+// CHECK5-NEXT: [[TMP9:%.*]] = load i32, i32* [[I]], align 4
+// CHECK5-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP0]], i32 0, i32 [[TMP9]]
+// CHECK5-NEXT: [[TMP10:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+// CHECK5-NEXT: [[ADD1:%.*]] = add nsw i32 [[TMP10]], 1
+// CHECK5-NEXT: store i32 [[ADD1]], i32* [[ARRAYIDX]], align 4
+// CHECK5-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK5: omp.body.continue:
+// CHECK5-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK5: omp.inner.for.inc:
+// CHECK5-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK5-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK5-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP11]], [[TMP12]]
+// CHECK5-NEXT: store i32 [[ADD2]], i32* [[DOTOMP_IV]], align 4
+// CHECK5-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK5: omp.inner.for.end:
+// CHECK5-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK5: omp.loop.exit:
+// CHECK5-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP4]])
+// CHECK5-NEXT: ret void
+//
+//
+// CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l59
+// CHECK5-SAME: ([10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i32 [[F:%.*]]) #[[ATTR0]] {
+// CHECK5-NEXT: entry:
+// CHECK5-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4
+// CHECK5-NEXT: [[F_ADDR:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[F_CASTED:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK5-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4
+// CHECK5-NEXT: store i32 [[F]], i32* [[F_ADDR]], align 4
+// CHECK5-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4
+// CHECK5-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK5-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0)
+// CHECK5-NEXT: br label [[DOTEXECUTE:%.*]]
+// CHECK5: .execute:
+// CHECK5-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]])
+// CHECK5-NEXT: [[TMP2:%.*]] = load i32, i32* [[F_ADDR]], align 4
+// CHECK5-NEXT: store i32 [[TMP2]], i32* [[F_CASTED]], align 4
+// CHECK5-NEXT: [[TMP3:%.*]] = load i32, i32* [[F_CASTED]], align 4
+// CHECK5-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK5-NEXT: call void @__omp_outlined__6(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x [10 x i32]]* [[TMP0]], i32 [[TMP3]]) #[[ATTR2]]
+// CHECK5-NEXT: br label [[DOTOMP_DEINIT:%.*]]
+// CHECK5: .omp.deinit:
+// CHECK5-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0)
+// CHECK5-NEXT: br label [[DOTEXIT:%.*]]
+// CHECK5: .exit:
+// CHECK5-NEXT: ret void
+//
+//
+// CHECK5-LABEL: define {{[^@]+}}@__omp_outlined__6
+// CHECK5-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i32 [[F:%.*]]) #[[ATTR0]] {
+// CHECK5-NEXT: entry:
+// CHECK5-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK5-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK5-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4
+// CHECK5-NEXT: [[F_ADDR:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[_TMP1:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[K:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[J:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[F_CASTED:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x i8*], align 4
+// CHECK5-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK5-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK5-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4
+// CHECK5-NEXT: store i32 [[F]], i32* [[F_ADDR]], align 4
+// CHECK5-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4
+// CHECK5-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK5-NEXT: store i32 99, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK5-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK5-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK5-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK5-NEXT: [[TMP1:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK5-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4
+// CHECK5-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]])
+// CHECK5-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK5-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP3]], 99
+// CHECK5-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK5: cond.true:
+// CHECK5-NEXT: br label [[COND_END:%.*]]
+// CHECK5: cond.false:
+// CHECK5-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK5-NEXT: br label [[COND_END]]
+// CHECK5: cond.end:
+// CHECK5-NEXT: [[COND:%.*]] = phi i32 [ 99, [[COND_TRUE]] ], [ [[TMP4]], [[COND_FALSE]] ]
+// CHECK5-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK5-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK5-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4
+// CHECK5-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK5: omp.inner.for.cond:
+// CHECK5-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK5-NEXT: [[CMP2:%.*]] = icmp slt i32 [[TMP6]], 100
+// CHECK5-NEXT: br i1 [[CMP2]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK5: omp.inner.for.body:
+// CHECK5-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK5-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK5-NEXT: [[TMP9:%.*]] = load i32, i32* [[F_ADDR]], align 4
+// CHECK5-NEXT: store i32 [[TMP9]], i32* [[F_CASTED]], align 4
+// CHECK5-NEXT: [[TMP10:%.*]] = load i32, i32* [[F_CASTED]], align 4
+// CHECK5-NEXT: [[TMP11:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0
+// CHECK5-NEXT: [[TMP12:%.*]] = inttoptr i32 [[TMP7]] to i8*
+// CHECK5-NEXT: store i8* [[TMP12]], i8** [[TMP11]], align 4
+// CHECK5-NEXT: [[TMP13:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1
+// CHECK5-NEXT: [[TMP14:%.*]] = inttoptr i32 [[TMP8]] to i8*
+// CHECK5-NEXT: store i8* [[TMP14]], i8** [[TMP13]], align 4
+// CHECK5-NEXT: [[TMP15:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2
+// CHECK5-NEXT: [[TMP16:%.*]] = bitcast [10 x [10 x i32]]* [[TMP0]] to i8*
+// CHECK5-NEXT: store i8* [[TMP16]], i8** [[TMP15]], align 4
+// CHECK5-NEXT: [[TMP17:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3
+// CHECK5-NEXT: [[TMP18:%.*]] = inttoptr i32 [[TMP10]] to i8*
+// CHECK5-NEXT: store i8* [[TMP18]], i8** [[TMP17]], align 4
+// CHECK5-NEXT: [[TMP19:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK5-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, [10 x [10 x i32]]*, i32)* @__omp_outlined__7 to i8*), i8* null, i8** [[TMP19]], i32 4)
+// CHECK5-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK5: omp.inner.for.inc:
+// CHECK5-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK5-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK5-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP20]], [[TMP21]]
+// CHECK5-NEXT: store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4
+// CHECK5-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK5-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK5-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP22]], [[TMP23]]
+// CHECK5-NEXT: store i32 [[ADD3]], i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK5-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK5-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK5-NEXT: [[ADD4:%.*]] = add nsw i32 [[TMP24]], [[TMP25]]
+// CHECK5-NEXT: store i32 [[ADD4]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK5-NEXT: [[TMP26:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK5-NEXT: [[CMP5:%.*]] = icmp sgt i32 [[TMP26]], 99
+// CHECK5-NEXT: br i1 [[CMP5]], label [[COND_TRUE6:%.*]], label [[COND_FALSE7:%.*]]
+// CHECK5: cond.true6:
+// CHECK5-NEXT: br label [[COND_END8:%.*]]
+// CHECK5: cond.false7:
+// CHECK5-NEXT: [[TMP27:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK5-NEXT: br label [[COND_END8]]
+// CHECK5: cond.end8:
+// CHECK5-NEXT: [[COND9:%.*]] = phi i32 [ 99, [[COND_TRUE6]] ], [ [[TMP27]], [[COND_FALSE7]] ]
+// CHECK5-NEXT: store i32 [[COND9]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK5-NEXT: [[TMP28:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK5-NEXT: store i32 [[TMP28]], i32* [[DOTOMP_IV]], align 4
+// CHECK5-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK5: omp.inner.for.end:
+// CHECK5-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK5: omp.loop.exit:
+// CHECK5-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]])
+// CHECK5-NEXT: ret void
+//
+//
+// CHECK5-LABEL: define {{[^@]+}}@__omp_outlined__7
+// CHECK5-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i32 [[F:%.*]]) #[[ATTR0]] {
+// CHECK5-NEXT: entry:
+// CHECK5-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK5-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK5-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4
+// CHECK5-NEXT: [[F_ADDR:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[_TMP1:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[K:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[J:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK5-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK5-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK5-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK5-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4
+// CHECK5-NEXT: store i32 [[F]], i32* [[F_ADDR]], align 4
+// CHECK5-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4
+// CHECK5-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK5-NEXT: store i32 99, i32* [[DOTOMP_UB]], align 4
+// CHECK5-NEXT: [[TMP1:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK5-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK5-NEXT: store i32 [[TMP1]], i32* [[DOTOMP_LB]], align 4
+// CHECK5-NEXT: store i32 [[TMP2]], i32* [[DOTOMP_UB]], align 4
+// CHECK5-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK5-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK5-NEXT: [[TMP3:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK5-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4
+// CHECK5-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP4]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK5-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK5-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4
+// CHECK5-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK5: omp.inner.for.cond:
+// CHECK5-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK5-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK5-NEXT: [[CMP:%.*]] = icmp ule i32 [[TMP6]], [[TMP7]]
+// CHECK5-NEXT: br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK5: omp.inner.for.body:
+// CHECK5-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK5-NEXT: [[DIV:%.*]] = sdiv i32 [[TMP8]], 10
+// CHECK5-NEXT: [[MUL:%.*]] = mul nsw i32 [[DIV]], 1
+// CHECK5-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK5-NEXT: store i32 [[ADD]], i32* [[I]], align 4
+// CHECK5-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK5-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK5-NEXT: [[DIV2:%.*]] = sdiv i32 [[TMP10]], 10
+// CHECK5-NEXT: [[MUL3:%.*]] = mul nsw i32 [[DIV2]], 10
+// CHECK5-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP9]], [[MUL3]]
+// CHECK5-NEXT: [[MUL4:%.*]] = mul nsw i32 [[SUB]], 1
+// CHECK5-NEXT: [[ADD5:%.*]] = add nsw i32 0, [[MUL4]]
+// CHECK5-NEXT: store i32 [[ADD5]], i32* [[J]], align 4
+// CHECK5-NEXT: store i32 10, i32* [[K]], align 4
+// CHECK5-NEXT: [[TMP11:%.*]] = load i32, i32* [[I]], align 4
+// CHECK5-NEXT: [[TMP12:%.*]] = load i32, i32* [[J]], align 4
+// CHECK5-NEXT: [[TMP13:%.*]] = load i32, i32* [[F_ADDR]], align 4
+// CHECK5-NEXT: [[MUL6:%.*]] = mul nsw i32 [[TMP12]], [[TMP13]]
+// CHECK5-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP11]], [[MUL6]]
+// CHECK5-NEXT: [[TMP14:%.*]] = load i32, i32* [[K]], align 4
+// CHECK5-NEXT: [[ADD8:%.*]] = add nsw i32 [[ADD7]], [[TMP14]]
+// CHECK5-NEXT: [[TMP15:%.*]] = load i32, i32* [[I]], align 4
+// CHECK5-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* [[TMP0]], i32 0, i32 [[TMP15]]
+// CHECK5-NEXT: [[TMP16:%.*]] = load i32, i32* [[J]], align 4
+// CHECK5-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[ARRAYIDX]], i32 0, i32 [[TMP16]]
+// CHECK5-NEXT: store i32 [[ADD8]], i32* [[ARRAYIDX9]], align 4
+// CHECK5-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK5: omp.body.continue:
+// CHECK5-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK5: omp.inner.for.inc:
+// CHECK5-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK5-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK5-NEXT: [[ADD10:%.*]] = add nsw i32 [[TMP17]], [[TMP18]]
+// CHECK5-NEXT: store i32 [[ADD10]], i32* [[DOTOMP_IV]], align 4
+// CHECK5-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK5: omp.inner.for.end:
+// CHECK5-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK5: omp.loop.exit:
+// CHECK5-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP4]])
+// CHECK5-NEXT: ret void
+//
+//
+// CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l67
+// CHECK5-SAME: (i32 [[N:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]]) #[[ATTR0]] {
+// CHECK5-NEXT: entry:
+// CHECK5-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4
+// CHECK5-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK5-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4
+// CHECK5-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4
+// CHECK5-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4
+// CHECK5-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK5-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0)
+// CHECK5-NEXT: br label [[DOTEXECUTE:%.*]]
+// CHECK5: .execute:
+// CHECK5-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]])
+// CHECK5-NEXT: [[TMP2:%.*]] = load i32, i32* [[N_ADDR]], align 4
+// CHECK5-NEXT: store i32 [[TMP2]], i32* [[N_CASTED]], align 4
+// CHECK5-NEXT: [[TMP3:%.*]] = load i32, i32* [[N_CASTED]], align 4
+// CHECK5-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK5-NEXT: call void @__omp_outlined__8(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32 [[TMP3]], [10 x [10 x i32]]* [[TMP0]]) #[[ATTR2]]
+// CHECK5-NEXT: br label [[DOTOMP_DEINIT:%.*]]
+// CHECK5: .omp.deinit:
+// CHECK5-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0)
+// CHECK5-NEXT: br label [[DOTEXIT:%.*]]
+// CHECK5: .exit:
+// CHECK5-NEXT: ret void
+//
+//
+// CHECK5-LABEL: define {{[^@]+}}@__omp_outlined__8
+// CHECK5-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[N:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]]) #[[ATTR0]] {
+// CHECK5-NEXT: entry:
+// CHECK5-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK5-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK5-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4
+// CHECK5-NEXT: [[DOTOMP_IV:%.*]] = alloca i64, align 8
+// CHECK5-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[_TMP1:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[DOTCAPTURE_EXPR_3:%.*]] = alloca i64, align 8
+// CHECK5-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[J:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i64, align 8
+// CHECK5-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i64, align 8
+// CHECK5-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i64, align 8
+// CHECK5-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[I9:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[J10:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x i8*], align 4
+// CHECK5-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK5-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK5-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4
+// CHECK5-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4
+// CHECK5-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4
+// CHECK5-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4
+// CHECK5-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK5-NEXT: [[TMP2:%.*]] = load i32, i32* [[N_ADDR]], align 4
+// CHECK5-NEXT: store i32 [[TMP2]], i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK5-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK5-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP3]], 0
+// CHECK5-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK5-NEXT: [[CONV:%.*]] = sext i32 [[DIV]] to i64
+// CHECK5-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK5-NEXT: [[SUB4:%.*]] = sub nsw i32 [[TMP4]], 0
+// CHECK5-NEXT: [[DIV5:%.*]] = sdiv i32 [[SUB4]], 1
+// CHECK5-NEXT: [[CONV6:%.*]] = sext i32 [[DIV5]] to i64
+// CHECK5-NEXT: [[MUL:%.*]] = mul nsw i64 [[CONV]], [[CONV6]]
+// CHECK5-NEXT: [[SUB7:%.*]] = sub nsw i64 [[MUL]], 1
+// CHECK5-NEXT: store i64 [[SUB7]], i64* [[DOTCAPTURE_EXPR_3]], align 8
+// CHECK5-NEXT: store i32 0, i32* [[I]], align 4
+// CHECK5-NEXT: store i32 0, i32* [[J]], align 4
+// CHECK5-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK5-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP5]]
+// CHECK5-NEXT: br i1 [[CMP]], label [[LAND_LHS_TRUE:%.*]], label [[OMP_PRECOND_END:%.*]]
+// CHECK5: land.lhs.true:
+// CHECK5-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK5-NEXT: [[CMP8:%.*]] = icmp slt i32 0, [[TMP6]]
+// CHECK5-NEXT: br i1 [[CMP8]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END]]
+// CHECK5: omp.precond.then:
+// CHECK5-NEXT: store i64 0, i64* [[DOTOMP_COMB_LB]], align 8
+// CHECK5-NEXT: [[TMP7:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8
+// CHECK5-NEXT: store i64 [[TMP7]], i64* [[DOTOMP_COMB_UB]], align 8
+// CHECK5-NEXT: store i64 1, i64* [[DOTOMP_STRIDE]], align 8
+// CHECK5-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK5-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK5-NEXT: [[CONV11:%.*]] = zext i32 [[NVPTX_NUM_THREADS]] to i64
+// CHECK5-NEXT: [[TMP8:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK5-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP8]], align 4
+// CHECK5-NEXT: call void @__kmpc_for_static_init_8(%struct.ident_t* @[[GLOB1]], i32 [[TMP9]], i32 91, i32* [[DOTOMP_IS_LAST]], i64* [[DOTOMP_COMB_LB]], i64* [[DOTOMP_COMB_UB]], i64* [[DOTOMP_STRIDE]], i64 1, i64 [[CONV11]])
+// CHECK5-NEXT: [[TMP10:%.*]] = load i64, i64* [[DOTOMP_COMB_UB]], align 8
+// CHECK5-NEXT: [[TMP11:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8
+// CHECK5-NEXT: [[CMP12:%.*]] = icmp sgt i64 [[TMP10]], [[TMP11]]
+// CHECK5-NEXT: br i1 [[CMP12]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK5: cond.true:
+// CHECK5-NEXT: [[TMP12:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8
+// CHECK5-NEXT: br label [[COND_END:%.*]]
+// CHECK5: cond.false:
+// CHECK5-NEXT: [[TMP13:%.*]] = load i64, i64* [[DOTOMP_COMB_UB]], align 8
+// CHECK5-NEXT: br label [[COND_END]]
+// CHECK5: cond.end:
+// CHECK5-NEXT: [[COND:%.*]] = phi i64 [ [[TMP12]], [[COND_TRUE]] ], [ [[TMP13]], [[COND_FALSE]] ]
+// CHECK5-NEXT: store i64 [[COND]], i64* [[DOTOMP_COMB_UB]], align 8
+// CHECK5-NEXT: [[TMP14:%.*]] = load i64, i64* [[DOTOMP_COMB_LB]], align 8
+// CHECK5-NEXT: store i64 [[TMP14]], i64* [[DOTOMP_IV]], align 8
+// CHECK5-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK5: omp.inner.for.cond:
+// CHECK5-NEXT: [[TMP15:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8
+// CHECK5-NEXT: [[TMP16:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8
+// CHECK5-NEXT: [[ADD:%.*]] = add nsw i64 [[TMP16]], 1
+// CHECK5-NEXT: [[CMP13:%.*]] = icmp slt i64 [[TMP15]], [[ADD]]
+// CHECK5-NEXT: br i1 [[CMP13]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK5: omp.inner.for.body:
+// CHECK5-NEXT: [[TMP17:%.*]] = load i64, i64* [[DOTOMP_COMB_LB]], align 8
+// CHECK5-NEXT: [[TMP18:%.*]] = trunc i64 [[TMP17]] to i32
+// CHECK5-NEXT: [[TMP19:%.*]] = load i64, i64* [[DOTOMP_COMB_UB]], align 8
+// CHECK5-NEXT: [[TMP20:%.*]] = trunc i64 [[TMP19]] to i32
+// CHECK5-NEXT: [[TMP21:%.*]] = load i32, i32* [[N_ADDR]], align 4
+// CHECK5-NEXT: store i32 [[TMP21]], i32* [[N_CASTED]], align 4
+// CHECK5-NEXT: [[TMP22:%.*]] = load i32, i32* [[N_CASTED]], align 4
+// CHECK5-NEXT: [[TMP23:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0
+// CHECK5-NEXT: [[TMP24:%.*]] = inttoptr i32 [[TMP18]] to i8*
+// CHECK5-NEXT: store i8* [[TMP24]], i8** [[TMP23]], align 4
+// CHECK5-NEXT: [[TMP25:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1
+// CHECK5-NEXT: [[TMP26:%.*]] = inttoptr i32 [[TMP20]] to i8*
+// CHECK5-NEXT: store i8* [[TMP26]], i8** [[TMP25]], align 4
+// CHECK5-NEXT: [[TMP27:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2
+// CHECK5-NEXT: [[TMP28:%.*]] = inttoptr i32 [[TMP22]] to i8*
+// CHECK5-NEXT: store i8* [[TMP28]], i8** [[TMP27]], align 4
+// CHECK5-NEXT: [[TMP29:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3
+// CHECK5-NEXT: [[TMP30:%.*]] = bitcast [10 x [10 x i32]]* [[TMP0]] to i8*
+// CHECK5-NEXT: store i8* [[TMP30]], i8** [[TMP29]], align 4
+// CHECK5-NEXT: [[TMP31:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK5-NEXT: [[TMP32:%.*]] = load i32, i32* [[TMP31]], align 4
+// CHECK5-NEXT: [[TMP33:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK5-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP32]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, i32, [10 x [10 x i32]]*)* @__omp_outlined__9 to i8*), i8* null, i8** [[TMP33]], i32 4)
+// CHECK5-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK5: omp.inner.for.inc:
+// CHECK5-NEXT: [[TMP34:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8
+// CHECK5-NEXT: [[TMP35:%.*]] = load i64, i64* [[DOTOMP_STRIDE]], align 8
+// CHECK5-NEXT: [[ADD14:%.*]] = add nsw i64 [[TMP34]], [[TMP35]]
+// CHECK5-NEXT: store i64 [[ADD14]], i64* [[DOTOMP_IV]], align 8
+// CHECK5-NEXT: [[TMP36:%.*]] = load i64, i64* [[DOTOMP_COMB_LB]], align 8
+// CHECK5-NEXT: [[TMP37:%.*]] = load i64, i64* [[DOTOMP_STRIDE]], align 8
+// CHECK5-NEXT: [[ADD15:%.*]] = add nsw i64 [[TMP36]], [[TMP37]]
+// CHECK5-NEXT: store i64 [[ADD15]], i64* [[DOTOMP_COMB_LB]], align 8
+// CHECK5-NEXT: [[TMP38:%.*]] = load i64, i64* [[DOTOMP_COMB_UB]], align 8
+// CHECK5-NEXT: [[TMP39:%.*]] = load i64, i64* [[DOTOMP_STRIDE]], align 8
+// CHECK5-NEXT: [[ADD16:%.*]] = add nsw i64 [[TMP38]], [[TMP39]]
+// CHECK5-NEXT: store i64 [[ADD16]], i64* [[DOTOMP_COMB_UB]], align 8
+// CHECK5-NEXT: [[TMP40:%.*]] = load i64, i64* [[DOTOMP_COMB_UB]], align 8
+// CHECK5-NEXT: [[TMP41:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8
+// CHECK5-NEXT: [[CMP17:%.*]] = icmp sgt i64 [[TMP40]], [[TMP41]]
+// CHECK5-NEXT: br i1 [[CMP17]], label [[COND_TRUE18:%.*]], label [[COND_FALSE19:%.*]]
+// CHECK5: cond.true18:
+// CHECK5-NEXT: [[TMP42:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8
+// CHECK5-NEXT: br label [[COND_END20:%.*]]
+// CHECK5: cond.false19:
+// CHECK5-NEXT: [[TMP43:%.*]] = load i64, i64* [[DOTOMP_COMB_UB]], align 8
+// CHECK5-NEXT: br label [[COND_END20]]
+// CHECK5: cond.end20:
+// CHECK5-NEXT: [[COND21:%.*]] = phi i64 [ [[TMP42]], [[COND_TRUE18]] ], [ [[TMP43]], [[COND_FALSE19]] ]
+// CHECK5-NEXT: store i64 [[COND21]], i64* [[DOTOMP_COMB_UB]], align 8
+// CHECK5-NEXT: [[TMP44:%.*]] = load i64, i64* [[DOTOMP_COMB_LB]], align 8
+// CHECK5-NEXT: store i64 [[TMP44]], i64* [[DOTOMP_IV]], align 8
+// CHECK5-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK5: omp.inner.for.end:
+// CHECK5-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK5: omp.loop.exit:
+// CHECK5-NEXT: [[TMP45:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK5-NEXT: [[TMP46:%.*]] = load i32, i32* [[TMP45]], align 4
+// CHECK5-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP46]])
+// CHECK5-NEXT: br label [[OMP_PRECOND_END]]
+// CHECK5: omp.precond.end:
+// CHECK5-NEXT: ret void
+//
+//
+// CHECK5-LABEL: define {{[^@]+}}@__omp_outlined__9
+// CHECK5-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32 [[N:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]]) #[[ATTR0]] {
+// CHECK5-NEXT: entry:
+// CHECK5-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK5-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK5-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4
+// CHECK5-NEXT: [[DOTOMP_IV:%.*]] = alloca i64, align 8
+// CHECK5-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[_TMP1:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[DOTCAPTURE_EXPR_3:%.*]] = alloca i64, align 8
+// CHECK5-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[J:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[DOTOMP_LB:%.*]] = alloca i64, align 8
+// CHECK5-NEXT: [[DOTOMP_UB:%.*]] = alloca i64, align 8
+// CHECK5-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i64, align 8
+// CHECK5-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[I11:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[J12:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK5-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK5-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK5-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK5-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4
+// CHECK5-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4
+// CHECK5-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4
+// CHECK5-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4
+// CHECK5-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK5-NEXT: [[TMP2:%.*]] = load i32, i32* [[N_ADDR]], align 4
+// CHECK5-NEXT: store i32 [[TMP2]], i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK5-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK5-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP3]], 0
+// CHECK5-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK5-NEXT: [[CONV:%.*]] = sext i32 [[DIV]] to i64
+// CHECK5-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK5-NEXT: [[SUB4:%.*]] = sub nsw i32 [[TMP4]], 0
+// CHECK5-NEXT: [[DIV5:%.*]] = sdiv i32 [[SUB4]], 1
+// CHECK5-NEXT: [[CONV6:%.*]] = sext i32 [[DIV5]] to i64
+// CHECK5-NEXT: [[MUL:%.*]] = mul nsw i64 [[CONV]], [[CONV6]]
+// CHECK5-NEXT: [[SUB7:%.*]] = sub nsw i64 [[MUL]], 1
+// CHECK5-NEXT: store i64 [[SUB7]], i64* [[DOTCAPTURE_EXPR_3]], align 8
+// CHECK5-NEXT: store i32 0, i32* [[I]], align 4
+// CHECK5-NEXT: store i32 0, i32* [[J]], align 4
+// CHECK5-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK5-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP5]]
+// CHECK5-NEXT: br i1 [[CMP]], label [[LAND_LHS_TRUE:%.*]], label [[OMP_PRECOND_END:%.*]]
+// CHECK5: land.lhs.true:
+// CHECK5-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK5-NEXT: [[CMP8:%.*]] = icmp slt i32 0, [[TMP6]]
+// CHECK5-NEXT: br i1 [[CMP8]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END]]
+// CHECK5: omp.precond.then:
+// CHECK5-NEXT: store i64 0, i64* [[DOTOMP_LB]], align 8
+// CHECK5-NEXT: [[TMP7:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8
+// CHECK5-NEXT: store i64 [[TMP7]], i64* [[DOTOMP_UB]], align 8
+// CHECK5-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK5-NEXT: [[CONV9:%.*]] = zext i32 [[TMP8]] to i64
+// CHECK5-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK5-NEXT: [[CONV10:%.*]] = zext i32 [[TMP9]] to i64
+// CHECK5-NEXT: store i64 [[CONV9]], i64* [[DOTOMP_LB]], align 8
+// CHECK5-NEXT: store i64 [[CONV10]], i64* [[DOTOMP_UB]], align 8
+// CHECK5-NEXT: store i64 1, i64* [[DOTOMP_STRIDE]], align 8
+// CHECK5-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK5-NEXT: [[TMP10:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK5-NEXT: [[TMP11:%.*]] = load i32, i32* [[TMP10]], align 4
+// CHECK5-NEXT: call void @__kmpc_for_static_init_8(%struct.ident_t* @[[GLOB2]], i32 [[TMP11]], i32 33, i32* [[DOTOMP_IS_LAST]], i64* [[DOTOMP_LB]], i64* [[DOTOMP_UB]], i64* [[DOTOMP_STRIDE]], i64 1, i64 1)
+// CHECK5-NEXT: [[TMP12:%.*]] = load i64, i64* [[DOTOMP_LB]], align 8
+// CHECK5-NEXT: store i64 [[TMP12]], i64* [[DOTOMP_IV]], align 8
+// CHECK5-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK5: omp.inner.for.cond:
+// CHECK5-NEXT: [[TMP13:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8
+// CHECK5-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK5-NEXT: [[CONV13:%.*]] = zext i32 [[TMP14]] to i64
+// CHECK5-NEXT: [[CMP14:%.*]] = icmp sle i64 [[TMP13]], [[CONV13]]
+// CHECK5-NEXT: br i1 [[CMP14]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK5: omp.inner.for.body:
+// CHECK5-NEXT: [[TMP15:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8
+// CHECK5-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK5-NEXT: [[SUB15:%.*]] = sub nsw i32 [[TMP16]], 0
+// CHECK5-NEXT: [[DIV16:%.*]] = sdiv i32 [[SUB15]], 1
+// CHECK5-NEXT: [[MUL17:%.*]] = mul nsw i32 1, [[DIV16]]
+// CHECK5-NEXT: [[CONV18:%.*]] = sext i32 [[MUL17]] to i64
+// CHECK5-NEXT: [[DIV19:%.*]] = sdiv i64 [[TMP15]], [[CONV18]]
+// CHECK5-NEXT: [[MUL20:%.*]] = mul nsw i64 [[DIV19]], 1
+// CHECK5-NEXT: [[ADD:%.*]] = add nsw i64 0, [[MUL20]]
+// CHECK5-NEXT: [[CONV21:%.*]] = trunc i64 [[ADD]] to i32
+// CHECK5-NEXT: store i32 [[CONV21]], i32* [[I11]], align 4
+// CHECK5-NEXT: [[TMP17:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8
+// CHECK5-NEXT: [[TMP18:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8
+// CHECK5-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK5-NEXT: [[SUB22:%.*]] = sub nsw i32 [[TMP19]], 0
+// CHECK5-NEXT: [[DIV23:%.*]] = sdiv i32 [[SUB22]], 1
+// CHECK5-NEXT: [[MUL24:%.*]] = mul nsw i32 1, [[DIV23]]
+// CHECK5-NEXT: [[CONV25:%.*]] = sext i32 [[MUL24]] to i64
+// CHECK5-NEXT: [[DIV26:%.*]] = sdiv i64 [[TMP18]], [[CONV25]]
+// CHECK5-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK5-NEXT: [[SUB27:%.*]] = sub nsw i32 [[TMP20]], 0
+// CHECK5-NEXT: [[DIV28:%.*]] = sdiv i32 [[SUB27]], 1
+// CHECK5-NEXT: [[MUL29:%.*]] = mul nsw i32 1, [[DIV28]]
+// CHECK5-NEXT: [[CONV30:%.*]] = sext i32 [[MUL29]] to i64
+// CHECK5-NEXT: [[MUL31:%.*]] = mul nsw i64 [[DIV26]], [[CONV30]]
+// CHECK5-NEXT: [[SUB32:%.*]] = sub nsw i64 [[TMP17]], [[MUL31]]
+// CHECK5-NEXT: [[MUL33:%.*]] = mul nsw i64 [[SUB32]], 1
+// CHECK5-NEXT: [[ADD34:%.*]] = add nsw i64 0, [[MUL33]]
+// CHECK5-NEXT: [[CONV35:%.*]] = trunc i64 [[ADD34]] to i32
+// CHECK5-NEXT: store i32 [[CONV35]], i32* [[J12]], align 4
+// CHECK5-NEXT: [[TMP21:%.*]] = load i32, i32* [[I11]], align 4
+// CHECK5-NEXT: [[TMP22:%.*]] = load i32, i32* [[J12]], align 4
+// CHECK5-NEXT: [[ADD36:%.*]] = add nsw i32 [[TMP21]], [[TMP22]]
+// CHECK5-NEXT: [[TMP23:%.*]] = load i32, i32* [[I11]], align 4
+// CHECK5-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* [[TMP0]], i32 0, i32 [[TMP23]]
+// CHECK5-NEXT: [[TMP24:%.*]] = load i32, i32* [[J12]], align 4
+// CHECK5-NEXT: [[ARRAYIDX37:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[ARRAYIDX]], i32 0, i32 [[TMP24]]
+// CHECK5-NEXT: store i32 [[ADD36]], i32* [[ARRAYIDX37]], align 4
+// CHECK5-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK5: omp.body.continue:
+// CHECK5-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK5: omp.inner.for.inc:
+// CHECK5-NEXT: [[TMP25:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8
+// CHECK5-NEXT: [[TMP26:%.*]] = load i64, i64* [[DOTOMP_STRIDE]], align 8
+// CHECK5-NEXT: [[ADD38:%.*]] = add nsw i64 [[TMP25]], [[TMP26]]
+// CHECK5-NEXT: store i64 [[ADD38]], i64* [[DOTOMP_IV]], align 8
+// CHECK5-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK5: omp.inner.for.end:
+// CHECK5-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK5: omp.loop.exit:
+// CHECK5-NEXT: [[TMP27:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK5-NEXT: [[TMP28:%.*]] = load i32, i32* [[TMP27]], align 4
+// CHECK5-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP28]])
+// CHECK5-NEXT: br label [[OMP_PRECOND_END]]
+// CHECK5: omp.precond.end:
+// CHECK5-NEXT: ret void
+//
+//
+// CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l74
+// CHECK5-SAME: (i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32* [[V:%.*]]) #[[ATTR0]] {
+// CHECK5-NEXT: entry:
+// CHECK5-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4
+// CHECK5-NEXT: [[V_ADDR:%.*]] = alloca i32*, align 4
+// CHECK5-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK5-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4
+// CHECK5-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4
+// CHECK5-NEXT: store i32* [[V]], i32** [[V_ADDR]], align 4
+// CHECK5-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4
+// CHECK5-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK5-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0)
+// CHECK5-NEXT: br label [[DOTEXECUTE:%.*]]
+// CHECK5: .execute:
+// CHECK5-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]])
+// CHECK5-NEXT: [[TMP2:%.*]] = load i32, i32* [[N_ADDR]], align 4
+// CHECK5-NEXT: store i32 [[TMP2]], i32* [[N_CASTED]], align 4
+// CHECK5-NEXT: [[TMP3:%.*]] = load i32, i32* [[N_CASTED]], align 4
+// CHECK5-NEXT: [[TMP4:%.*]] = load i32*, i32** [[V_ADDR]], align 4
+// CHECK5-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK5-NEXT: call void @__omp_outlined__10(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32 [[TMP3]], [1000 x i32]* [[TMP0]], i32* [[TMP4]]) #[[ATTR2]]
+// CHECK5-NEXT: br label [[DOTOMP_DEINIT:%.*]]
+// CHECK5: .omp.deinit:
+// CHECK5-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0)
+// CHECK5-NEXT: br label [[DOTEXIT:%.*]]
+// CHECK5: .exit:
+// CHECK5-NEXT: ret void
+//
+//
+// CHECK5-LABEL: define {{[^@]+}}@__omp_outlined__10
+// CHECK5-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32* [[V:%.*]]) #[[ATTR0]] {
+// CHECK5-NEXT: entry:
+// CHECK5-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK5-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK5-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4
+// CHECK5-NEXT: [[V_ADDR:%.*]] = alloca i32*, align 4
+// CHECK5-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[I3:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [5 x i8*], align 4
+// CHECK5-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK5-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK5-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4
+// CHECK5-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4
+// CHECK5-NEXT: store i32* [[V]], i32** [[V_ADDR]], align 4
+// CHECK5-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4
+// CHECK5-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4
+// CHECK5-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK5-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK5-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0
+// CHECK5-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK5-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1
+// CHECK5-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK5-NEXT: store i32 0, i32* [[I]], align 4
+// CHECK5-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK5-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]]
+// CHECK5-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]]
+// CHECK5: omp.precond.then:
+// CHECK5-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK5-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK5-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK5-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK5-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK5-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK5-NEXT: [[TMP5:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK5-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4
+// CHECK5-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP6]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]])
+// CHECK5-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK5-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK5-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP7]], [[TMP8]]
+// CHECK5-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK5: cond.true:
+// CHECK5-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK5-NEXT: br label [[COND_END:%.*]]
+// CHECK5: cond.false:
+// CHECK5-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK5-NEXT: br label [[COND_END]]
+// CHECK5: cond.end:
+// CHECK5-NEXT: [[COND:%.*]] = phi i32 [ [[TMP9]], [[COND_TRUE]] ], [ [[TMP10]], [[COND_FALSE]] ]
+// CHECK5-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK5-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK5-NEXT: store i32 [[TMP11]], i32* [[DOTOMP_IV]], align 4
+// CHECK5-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK5: omp.inner.for.cond:
+// CHECK5-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK5-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK5-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP13]], 1
+// CHECK5-NEXT: [[CMP5:%.*]] = icmp slt i32 [[TMP12]], [[ADD]]
+// CHECK5-NEXT: br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK5: omp.inner.for.body:
+// CHECK5-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK5-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK5-NEXT: [[TMP16:%.*]] = load i32, i32* [[N_ADDR]], align 4
+// CHECK5-NEXT: store i32 [[TMP16]], i32* [[N_CASTED]], align 4
+// CHECK5-NEXT: [[TMP17:%.*]] = load i32, i32* [[N_CASTED]], align 4
+// CHECK5-NEXT: [[TMP18:%.*]] = load i32*, i32** [[V_ADDR]], align 4
+// CHECK5-NEXT: [[TMP19:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0
+// CHECK5-NEXT: [[TMP20:%.*]] = inttoptr i32 [[TMP14]] to i8*
+// CHECK5-NEXT: store i8* [[TMP20]], i8** [[TMP19]], align 4
+// CHECK5-NEXT: [[TMP21:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1
+// CHECK5-NEXT: [[TMP22:%.*]] = inttoptr i32 [[TMP15]] to i8*
+// CHECK5-NEXT: store i8* [[TMP22]], i8** [[TMP21]], align 4
+// CHECK5-NEXT: [[TMP23:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2
+// CHECK5-NEXT: [[TMP24:%.*]] = inttoptr i32 [[TMP17]] to i8*
+// CHECK5-NEXT: store i8* [[TMP24]], i8** [[TMP23]], align 4
+// CHECK5-NEXT: [[TMP25:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3
+// CHECK5-NEXT: [[TMP26:%.*]] = bitcast [1000 x i32]* [[TMP0]] to i8*
+// CHECK5-NEXT: store i8* [[TMP26]], i8** [[TMP25]], align 4
+// CHECK5-NEXT: [[TMP27:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 4
+// CHECK5-NEXT: [[TMP28:%.*]] = bitcast i32* [[TMP18]] to i8*
+// CHECK5-NEXT: store i8* [[TMP28]], i8** [[TMP27]], align 4
+// CHECK5-NEXT: [[TMP29:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK5-NEXT: [[TMP30:%.*]] = load i32, i32* [[TMP29]], align 4
+// CHECK5-NEXT: [[TMP31:%.*]] = bitcast [5 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK5-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP30]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, i32, [1000 x i32]*, i32*)* @__omp_outlined__11 to i8*), i8* null, i8** [[TMP31]], i32 5)
+// CHECK5-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK5: omp.inner.for.inc:
+// CHECK5-NEXT: [[TMP32:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK5-NEXT: [[TMP33:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK5-NEXT: [[ADD6:%.*]] = add nsw i32 [[TMP32]], [[TMP33]]
+// CHECK5-NEXT: store i32 [[ADD6]], i32* [[DOTOMP_IV]], align 4
+// CHECK5-NEXT: [[TMP34:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK5-NEXT: [[TMP35:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK5-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP34]], [[TMP35]]
+// CHECK5-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK5-NEXT: [[TMP36:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK5-NEXT: [[TMP37:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK5-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP36]], [[TMP37]]
+// CHECK5-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK5-NEXT: [[TMP38:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK5-NEXT: [[TMP39:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK5-NEXT: [[CMP9:%.*]] = icmp sgt i32 [[TMP38]], [[TMP39]]
+// CHECK5-NEXT: br i1 [[CMP9]], label [[COND_TRUE10:%.*]], label [[COND_FALSE11:%.*]]
+// CHECK5: cond.true10:
+// CHECK5-NEXT: [[TMP40:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK5-NEXT: br label [[COND_END12:%.*]]
+// CHECK5: cond.false11:
+// CHECK5-NEXT: [[TMP41:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK5-NEXT: br label [[COND_END12]]
+// CHECK5: cond.end12:
+// CHECK5-NEXT: [[COND13:%.*]] = phi i32 [ [[TMP40]], [[COND_TRUE10]] ], [ [[TMP41]], [[COND_FALSE11]] ]
+// CHECK5-NEXT: store i32 [[COND13]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK5-NEXT: [[TMP42:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK5-NEXT:
store i32 [[TMP42]], i32* [[DOTOMP_IV]], align 4 +// CHECK5-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK5: omp.inner.for.end: +// CHECK5-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK5: omp.loop.exit: +// CHECK5-NEXT: [[TMP43:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK5-NEXT: [[TMP44:%.*]] = load i32, i32* [[TMP43]], align 4 +// CHECK5-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP44]]) +// CHECK5-NEXT: br label [[OMP_PRECOND_END]] +// CHECK5: omp.precond.end: +// CHECK5-NEXT: ret void +// +// +// CHECK5-LABEL: define {{[^@]+}}@__omp_outlined__11 +// CHECK5-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32* [[V:%.*]]) #[[ATTR0]] { +// CHECK5-NEXT: entry: +// CHECK5-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK5-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK5-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4 +// CHECK5-NEXT: [[V_ADDR:%.*]] = alloca i32*, align 4 +// CHECK5-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[I3:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK5-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK5-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK5-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK5-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK5-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4 +// CHECK5-NEXT: store i32* [[V]], i32** [[V_ADDR]], align 4 +// CHECK5-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4 +// CHECK5-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK5-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK5-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK5-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 +// CHECK5-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK5-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK5-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK5-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK5-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK5-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] +// CHECK5-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK5: omp.precond.then: +// CHECK5-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 +// CHECK5-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK5-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_UB]], align 4 +// CHECK5-NEXT: [[TMP5:%.*]] = load i32, i32* 
[[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK5-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK5-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_LB]], align 4 +// CHECK5-NEXT: store i32 [[TMP6]], i32* [[DOTOMP_UB]], align 4 +// CHECK5-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK5-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK5-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK5-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4 +// CHECK5-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP8]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) +// CHECK5-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK5-NEXT: store i32 [[TMP9]], i32* [[DOTOMP_IV]], align 4 +// CHECK5-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK5: omp.inner.for.cond: +// CHECK5-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK5-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK5-NEXT: [[CMP4:%.*]] = icmp ule i32 [[TMP10]], [[TMP11]] +// CHECK5-NEXT: br i1 [[CMP4]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK5: omp.inner.for.body: +// CHECK5-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK5-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP12]], 1 +// CHECK5-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK5-NEXT: store i32 [[ADD]], i32* [[I3]], align 4 +// CHECK5-NEXT: [[TMP13:%.*]] = load i32*, i32** [[V_ADDR]], align 4 +// CHECK5-NEXT: [[TMP14:%.*]] = load i32, i32* [[I3]], align 4 +// CHECK5-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TMP13]], i32 [[TMP14]] +// CHECK5-NEXT: [[TMP15:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +// CHECK5-NEXT: [[TMP16:%.*]] = load i32, i32* [[I3]], align 4 +// CHECK5-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds [1000 x i32], [1000 x i32]* [[TMP0]], i32 0, i32 [[TMP16]] +// CHECK5-NEXT: store i32 [[TMP15]], i32* [[ARRAYIDX5]], align 4 +// CHECK5-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK5: omp.body.continue: +// CHECK5-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK5: omp.inner.for.inc: +// CHECK5-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK5-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK5-NEXT: [[ADD6:%.*]] = add nsw i32 [[TMP17]], [[TMP18]] +// CHECK5-NEXT: store i32 [[ADD6]], i32* [[DOTOMP_IV]], align 4 +// CHECK5-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK5: omp.inner.for.end: +// CHECK5-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK5: omp.loop.exit: +// CHECK5-NEXT: [[TMP19:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK5-NEXT: [[TMP20:%.*]] = load i32, i32* [[TMP19]], align 4 +// CHECK5-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP20]]) +// CHECK5-NEXT: br label [[OMP_PRECOND_END]] +// CHECK5: omp.precond.end: +// CHECK5-NEXT: ret void +// +// +// CHECK6-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l43 +// CHECK6-SAME: (i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32 [[L:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK6-NEXT: entry: +// CHECK6-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4 +// CHECK6-NEXT: [[L_ADDR:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[L_CASTED:%.*]] = alloca 
i32, align 4 +// CHECK6-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK6-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK6-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4 +// CHECK6-NEXT: store i32 [[L]], i32* [[L_ADDR]], align 4 +// CHECK6-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4 +// CHECK6-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK6-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) +// CHECK6-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK6: .execute: +// CHECK6-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3:[0-9]+]]) +// CHECK6-NEXT: [[TMP2:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK6-NEXT: store i32 [[TMP2]], i32* [[N_CASTED]], align 4 +// CHECK6-NEXT: [[TMP3:%.*]] = load i32, i32* [[N_CASTED]], align 4 +// CHECK6-NEXT: [[TMP4:%.*]] = load i32, i32* [[L_ADDR]], align 4 +// CHECK6-NEXT: store i32 [[TMP4]], i32* [[L_CASTED]], align 4 +// CHECK6-NEXT: [[TMP5:%.*]] = load i32, i32* [[L_CASTED]], align 4 +// CHECK6-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK6-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32 [[TMP3]], [1000 x i32]* [[TMP0]], i32 [[TMP5]]) #[[ATTR2:[0-9]+]] +// CHECK6-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK6: .omp.deinit: +// CHECK6-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) +// CHECK6-NEXT: br label [[DOTEXIT:%.*]] +// CHECK6: .exit: +// CHECK6-NEXT: ret void +// +// +// CHECK6-LABEL: define {{[^@]+}}@__omp_outlined__ +// CHECK6-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32 [[L:%.*]]) #[[ATTR0]] { +// CHECK6-NEXT: entry: +// CHECK6-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK6-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK6-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4 +// CHECK6-NEXT: [[L_ADDR:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[I4:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[L_CASTED:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [5 x i8*], align 4 +// CHECK6-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK6-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK6-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK6-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4 +// CHECK6-NEXT: store i32 [[L]], i32* [[L_ADDR]], align 4 +// CHECK6-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4 +// CHECK6-NEXT: [[TMP1:%.*]] = call i8* 
@__kmpc_data_sharing_push_stack(i32 4, i16 1) +// CHECK6-NEXT: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to %struct._globalized_locals_ty* +// CHECK6-NEXT: [[L1:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP2]], i32 0, i32 0 +// CHECK6-NEXT: [[TMP3:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK6-NEXT: store i32 [[TMP3]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK6-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK6-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP4]], 0 +// CHECK6-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK6-NEXT: [[SUB3:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK6-NEXT: store i32 [[SUB3]], i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK6-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK6-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK6-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP5]] +// CHECK6-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK6: omp.precond.then: +// CHECK6-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK6-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK6-NEXT: store i32 [[TMP6]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK6-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK6-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK6-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK6-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4 +// CHECK6-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1:[0-9]+]], i32 [[TMP8]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 128) +// CHECK6-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK6-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK6-NEXT: [[CMP5:%.*]] = icmp sgt i32 [[TMP9]], [[TMP10]] +// CHECK6-NEXT: br i1 [[CMP5]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK6: cond.true: +// CHECK6-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK6-NEXT: br label [[COND_END:%.*]] +// CHECK6: cond.false: +// CHECK6-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK6-NEXT: br label [[COND_END]] +// CHECK6: cond.end: +// CHECK6-NEXT: [[COND:%.*]] = phi i32 [ [[TMP11]], [[COND_TRUE]] ], [ [[TMP12]], [[COND_FALSE]] ] +// CHECK6-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK6-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK6-NEXT: store i32 [[TMP13]], i32* [[DOTOMP_IV]], align 4 +// CHECK6-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK6: omp.inner.for.cond: +// CHECK6-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK6-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK6-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP15]], 1 +// CHECK6-NEXT: [[CMP6:%.*]] = icmp slt i32 [[TMP14]], [[ADD]] +// CHECK6-NEXT: br i1 [[CMP6]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK6: omp.inner.for.body: +// CHECK6-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK6-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK6-NEXT: [[TMP18:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK6-NEXT: store i32 [[TMP18]], i32* [[N_CASTED]], align 4 +// CHECK6-NEXT: [[TMP19:%.*]] = load i32, i32* [[N_CASTED]], align 4 +// CHECK6-NEXT: [[TMP20:%.*]] = load i32, i32* 
[[L_ADDR]], align 4 +// CHECK6-NEXT: store i32 [[TMP20]], i32* [[L_CASTED]], align 4 +// CHECK6-NEXT: [[TMP21:%.*]] = load i32, i32* [[L_CASTED]], align 4 +// CHECK6-NEXT: [[TMP22:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 +// CHECK6-NEXT: [[TMP23:%.*]] = inttoptr i32 [[TMP16]] to i8* +// CHECK6-NEXT: store i8* [[TMP23]], i8** [[TMP22]], align 4 +// CHECK6-NEXT: [[TMP24:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 +// CHECK6-NEXT: [[TMP25:%.*]] = inttoptr i32 [[TMP17]] to i8* +// CHECK6-NEXT: store i8* [[TMP25]], i8** [[TMP24]], align 4 +// CHECK6-NEXT: [[TMP26:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 +// CHECK6-NEXT: [[TMP27:%.*]] = inttoptr i32 [[TMP19]] to i8* +// CHECK6-NEXT: store i8* [[TMP27]], i8** [[TMP26]], align 4 +// CHECK6-NEXT: [[TMP28:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3 +// CHECK6-NEXT: [[TMP29:%.*]] = bitcast [1000 x i32]* [[TMP0]] to i8* +// CHECK6-NEXT: store i8* [[TMP29]], i8** [[TMP28]], align 4 +// CHECK6-NEXT: [[TMP30:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 4 +// CHECK6-NEXT: [[TMP31:%.*]] = inttoptr i32 [[TMP21]] to i8* +// CHECK6-NEXT: store i8* [[TMP31]], i8** [[TMP30]], align 4 +// CHECK6-NEXT: [[TMP32:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK6-NEXT: [[TMP33:%.*]] = load i32, i32* [[TMP32]], align 4 +// CHECK6-NEXT: [[TMP34:%.*]] = bitcast [5 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK6-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP33]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, i32, [1000 x i32]*, i32)* @__omp_outlined__1 to i8*), i8* null, i8** [[TMP34]], i32 5) +// CHECK6-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK6: omp.inner.for.inc: +// CHECK6-NEXT: [[TMP35:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK6-NEXT: [[TMP36:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK6-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP35]], [[TMP36]] +// CHECK6-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_IV]], align 4 +// CHECK6-NEXT: [[TMP37:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK6-NEXT: [[TMP38:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK6-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP37]], [[TMP38]] +// CHECK6-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK6-NEXT: [[TMP39:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK6-NEXT: [[TMP40:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK6-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP39]], [[TMP40]] +// CHECK6-NEXT: store i32 [[ADD9]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK6-NEXT: [[TMP41:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK6-NEXT: [[TMP42:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK6-NEXT: [[CMP10:%.*]] = icmp sgt i32 [[TMP41]], [[TMP42]] +// CHECK6-NEXT: br i1 [[CMP10]], label [[COND_TRUE11:%.*]], label [[COND_FALSE12:%.*]] +// CHECK6: cond.true11: +// CHECK6-NEXT: [[TMP43:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK6-NEXT: br label [[COND_END13:%.*]] +// CHECK6: cond.false12: +// CHECK6-NEXT: [[TMP44:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK6-NEXT: br label [[COND_END13]] +// CHECK6: cond.end13: +// CHECK6-NEXT: [[COND14:%.*]] = phi i32 [ [[TMP43]], [[COND_TRUE11]] ], [ [[TMP44]], [[COND_FALSE12]] ] +// CHECK6-NEXT: store i32 [[COND14]], i32* 
[[DOTOMP_COMB_UB]], align 4 +// CHECK6-NEXT: [[TMP45:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK6-NEXT: store i32 [[TMP45]], i32* [[DOTOMP_IV]], align 4 +// CHECK6-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK6: omp.inner.for.end: +// CHECK6-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK6: omp.loop.exit: +// CHECK6-NEXT: [[TMP46:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK6-NEXT: [[TMP47:%.*]] = load i32, i32* [[TMP46]], align 4 +// CHECK6-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP47]]) +// CHECK6-NEXT: [[TMP48:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK6-NEXT: [[TMP49:%.*]] = icmp ne i32 [[TMP48]], 0 +// CHECK6-NEXT: br i1 [[TMP49]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]] +// CHECK6: .omp.lastprivate.then: +// CHECK6-NEXT: [[TMP50:%.*]] = load i32, i32* [[L_ADDR]], align 4 +// CHECK6-NEXT: store i32 [[TMP50]], i32* [[L_ADDR]], align 4 +// CHECK6-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]] +// CHECK6: .omp.lastprivate.done: +// CHECK6-NEXT: br label [[OMP_PRECOND_END]] +// CHECK6: omp.precond.end: +// CHECK6-NEXT: call void @__kmpc_data_sharing_pop_stack(i8* [[TMP1]]) +// CHECK6-NEXT: ret void +// +// +// CHECK6-LABEL: define {{[^@]+}}@__omp_outlined__1 +// CHECK6-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32 [[L:%.*]]) #[[ATTR0]] { +// CHECK6-NEXT: entry: +// CHECK6-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK6-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK6-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4 +// CHECK6-NEXT: [[L_ADDR:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[I3:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK6-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK6-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK6-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK6-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK6-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4 +// CHECK6-NEXT: store i32 [[L]], i32* [[L_ADDR]], align 4 +// CHECK6-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4 +// CHECK6-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK6-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK6-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK6-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 +// CHECK6-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// 
CHECK6-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK6-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK6-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK6-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK6-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] +// CHECK6-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK6: omp.precond.then: +// CHECK6-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 +// CHECK6-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK6-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_UB]], align 4 +// CHECK6-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK6-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK6-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_LB]], align 4 +// CHECK6-NEXT: store i32 [[TMP6]], i32* [[DOTOMP_UB]], align 4 +// CHECK6-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK6-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK6-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK6-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4 +// CHECK6-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP8]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 32) +// CHECK6-NEXT: br label [[OMP_DISPATCH_COND:%.*]] +// CHECK6: omp.dispatch.cond: +// CHECK6-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK6-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK6-NEXT: [[CMP4:%.*]] = icmp ugt i32 [[TMP9]], [[TMP10]] +// CHECK6-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK6: cond.true: +// CHECK6-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK6-NEXT: br label [[COND_END:%.*]] +// CHECK6: cond.false: +// CHECK6-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK6-NEXT: br label [[COND_END]] +// CHECK6: cond.end: +// CHECK6-NEXT: [[COND:%.*]] = phi i32 [ [[TMP11]], [[COND_TRUE]] ], [ [[TMP12]], [[COND_FALSE]] ] +// CHECK6-NEXT: store i32 [[COND]], i32* [[DOTOMP_UB]], align 4 +// CHECK6-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK6-NEXT: store i32 [[TMP13]], i32* [[DOTOMP_IV]], align 4 +// CHECK6-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK6-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK6-NEXT: [[CMP5:%.*]] = icmp sle i32 [[TMP14]], [[TMP15]] +// CHECK6-NEXT: br i1 [[CMP5]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]] +// CHECK6: omp.dispatch.body: +// CHECK6-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK6: omp.inner.for.cond: +// CHECK6-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK6-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK6-NEXT: [[CMP6:%.*]] = icmp sle i32 [[TMP16]], [[TMP17]] +// CHECK6-NEXT: br i1 [[CMP6]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK6: omp.inner.for.body: +// CHECK6-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK6-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP18]], 1 +// CHECK6-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK6-NEXT: store i32 [[ADD]], i32* [[I3]], align 4 +// CHECK6-NEXT: [[TMP19:%.*]] = load i32, i32* [[I3]], align 4 +// CHECK6-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1000 x i32], [1000 x i32]* 
[[TMP0]], i32 0, i32 [[TMP19]] +// CHECK6-NEXT: store i32 1, i32* [[ARRAYIDX]], align 4 +// CHECK6-NEXT: [[TMP20:%.*]] = load i32, i32* [[I3]], align 4 +// CHECK6-NEXT: store i32 [[TMP20]], i32* [[L_ADDR]], align 4 +// CHECK6-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK6: omp.body.continue: +// CHECK6-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK6: omp.inner.for.inc: +// CHECK6-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK6-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP21]], 1 +// CHECK6-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_IV]], align 4 +// CHECK6-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK6: omp.inner.for.end: +// CHECK6-NEXT: br label [[OMP_DISPATCH_INC:%.*]] +// CHECK6: omp.dispatch.inc: +// CHECK6-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK6-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK6-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP22]], [[TMP23]] +// CHECK6-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_LB]], align 4 +// CHECK6-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK6-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK6-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP24]], [[TMP25]] +// CHECK6-NEXT: store i32 [[ADD9]], i32* [[DOTOMP_UB]], align 4 +// CHECK6-NEXT: br label [[OMP_DISPATCH_COND]] +// CHECK6: omp.dispatch.end: +// CHECK6-NEXT: [[TMP26:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK6-NEXT: [[TMP27:%.*]] = load i32, i32* [[TMP26]], align 4 +// CHECK6-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP27]]) +// CHECK6-NEXT: [[TMP28:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK6-NEXT: [[TMP29:%.*]] = icmp ne i32 [[TMP28]], 0 +// CHECK6-NEXT: br i1 [[TMP29]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]] +// CHECK6: .omp.lastprivate.then: +// CHECK6-NEXT: [[TMP30:%.*]] = load i32, i32* [[L_ADDR]], align 4 +// CHECK6-NEXT: store i32 [[TMP30]], i32* [[L_ADDR]], align 4 +// CHECK6-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]] +// CHECK6: .omp.lastprivate.done: +// CHECK6-NEXT: br label [[OMP_PRECOND_END]] +// CHECK6: omp.precond.end: +// CHECK6-NEXT: ret void +// +// +// CHECK6-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l49 +// CHECK6-SAME: (i32 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { +// CHECK6-NEXT: entry: +// CHECK6-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 4 +// CHECK6-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK6-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK6-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 4 +// CHECK6-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 4 +// CHECK6-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK6-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) +// CHECK6-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK6: .execute: +// CHECK6-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) +// CHECK6-NEXT: [[TMP2:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK6-NEXT: store i32 [[TMP2]], i32* [[N_CASTED]], align 4 +// CHECK6-NEXT: [[TMP3:%.*]] = load i32, 
i32* [[N_CASTED]], align 4 +// CHECK6-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK6-NEXT: call void @__omp_outlined__2(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32 [[TMP3]], [1000 x i16]* [[TMP0]]) #[[ATTR2]] +// CHECK6-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK6: .omp.deinit: +// CHECK6-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) +// CHECK6-NEXT: br label [[DOTEXIT:%.*]] +// CHECK6: .exit: +// CHECK6-NEXT: ret void +// +// +// CHECK6-LABEL: define {{[^@]+}}@__omp_outlined__2 +// CHECK6-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { +// CHECK6-NEXT: entry: +// CHECK6-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK6-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK6-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 4 +// CHECK6-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[I3:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x i8*], align 4 +// CHECK6-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK6-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK6-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK6-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 4 +// CHECK6-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 4 +// CHECK6-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK6-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK6-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK6-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 +// CHECK6-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK6-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK6-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK6-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK6-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK6-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] +// CHECK6-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK6: omp.precond.then: +// CHECK6-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK6-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK6-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK6-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK6-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK6-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK6-NEXT: [[TMP5:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK6-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4 +// CHECK6-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP6]], i32 91, i32* 
[[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) +// CHECK6-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK6-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK6-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP7]], [[TMP8]] +// CHECK6-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK6: cond.true: +// CHECK6-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK6-NEXT: br label [[COND_END:%.*]] +// CHECK6: cond.false: +// CHECK6-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK6-NEXT: br label [[COND_END]] +// CHECK6: cond.end: +// CHECK6-NEXT: [[COND:%.*]] = phi i32 [ [[TMP9]], [[COND_TRUE]] ], [ [[TMP10]], [[COND_FALSE]] ] +// CHECK6-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK6-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK6-NEXT: store i32 [[TMP11]], i32* [[DOTOMP_IV]], align 4 +// CHECK6-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK6: omp.inner.for.cond: +// CHECK6-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK6-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK6-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP13]], 1 +// CHECK6-NEXT: [[CMP5:%.*]] = icmp slt i32 [[TMP12]], [[ADD]] +// CHECK6-NEXT: br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK6: omp.inner.for.body: +// CHECK6-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK6-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK6-NEXT: [[TMP16:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK6-NEXT: store i32 [[TMP16]], i32* [[N_CASTED]], align 4 +// CHECK6-NEXT: [[TMP17:%.*]] = load i32, i32* [[N_CASTED]], align 4 +// CHECK6-NEXT: [[TMP18:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 +// CHECK6-NEXT: [[TMP19:%.*]] = inttoptr i32 [[TMP14]] to i8* +// CHECK6-NEXT: store i8* [[TMP19]], i8** [[TMP18]], align 4 +// CHECK6-NEXT: [[TMP20:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 +// CHECK6-NEXT: [[TMP21:%.*]] = inttoptr i32 [[TMP15]] to i8* +// CHECK6-NEXT: store i8* [[TMP21]], i8** [[TMP20]], align 4 +// CHECK6-NEXT: [[TMP22:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 +// CHECK6-NEXT: [[TMP23:%.*]] = inttoptr i32 [[TMP17]] to i8* +// CHECK6-NEXT: store i8* [[TMP23]], i8** [[TMP22]], align 4 +// CHECK6-NEXT: [[TMP24:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3 +// CHECK6-NEXT: [[TMP25:%.*]] = bitcast [1000 x i16]* [[TMP0]] to i8* +// CHECK6-NEXT: store i8* [[TMP25]], i8** [[TMP24]], align 4 +// CHECK6-NEXT: [[TMP26:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK6-NEXT: [[TMP27:%.*]] = load i32, i32* [[TMP26]], align 4 +// CHECK6-NEXT: [[TMP28:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK6-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP27]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, i32, [1000 x i16]*)* @__omp_outlined__3 to i8*), i8* null, i8** [[TMP28]], i32 4) +// CHECK6-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK6: omp.inner.for.inc: +// CHECK6-NEXT: [[TMP29:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK6-NEXT: [[TMP30:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 
4 +// CHECK6-NEXT: [[ADD6:%.*]] = add nsw i32 [[TMP29]], [[TMP30]] +// CHECK6-NEXT: store i32 [[ADD6]], i32* [[DOTOMP_IV]], align 4 +// CHECK6-NEXT: [[TMP31:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK6-NEXT: [[TMP32:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK6-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP31]], [[TMP32]] +// CHECK6-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK6-NEXT: [[TMP33:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK6-NEXT: [[TMP34:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK6-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP33]], [[TMP34]] +// CHECK6-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK6-NEXT: [[TMP35:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK6-NEXT: [[TMP36:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK6-NEXT: [[CMP9:%.*]] = icmp sgt i32 [[TMP35]], [[TMP36]] +// CHECK6-NEXT: br i1 [[CMP9]], label [[COND_TRUE10:%.*]], label [[COND_FALSE11:%.*]] +// CHECK6: cond.true10: +// CHECK6-NEXT: [[TMP37:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK6-NEXT: br label [[COND_END12:%.*]] +// CHECK6: cond.false11: +// CHECK6-NEXT: [[TMP38:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK6-NEXT: br label [[COND_END12]] +// CHECK6: cond.end12: +// CHECK6-NEXT: [[COND13:%.*]] = phi i32 [ [[TMP37]], [[COND_TRUE10]] ], [ [[TMP38]], [[COND_FALSE11]] ] +// CHECK6-NEXT: store i32 [[COND13]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK6-NEXT: [[TMP39:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK6-NEXT: store i32 [[TMP39]], i32* [[DOTOMP_IV]], align 4 +// CHECK6-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK6: omp.inner.for.end: +// CHECK6-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK6: omp.loop.exit: +// CHECK6-NEXT: [[TMP40:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK6-NEXT: [[TMP41:%.*]] = load i32, i32* [[TMP40]], align 4 +// CHECK6-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP41]]) +// CHECK6-NEXT: br label [[OMP_PRECOND_END]] +// CHECK6: omp.precond.end: +// CHECK6-NEXT: ret void +// +// +// CHECK6-LABEL: define {{[^@]+}}@__omp_outlined__3 +// CHECK6-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { +// CHECK6-NEXT: entry: +// CHECK6-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK6-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK6-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 4 +// CHECK6-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[I3:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK6-NEXT: store i32* 
[[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK6-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK6-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK6-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK6-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 4 +// CHECK6-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 4 +// CHECK6-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK6-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK6-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK6-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 +// CHECK6-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK6-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK6-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK6-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK6-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK6-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] +// CHECK6-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK6: omp.precond.then: +// CHECK6-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 +// CHECK6-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK6-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_UB]], align 4 +// CHECK6-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK6-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK6-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_LB]], align 4 +// CHECK6-NEXT: store i32 [[TMP6]], i32* [[DOTOMP_UB]], align 4 +// CHECK6-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK6-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK6-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK6-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4 +// CHECK6-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP8]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) +// CHECK6-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK6-NEXT: store i32 [[TMP9]], i32* [[DOTOMP_IV]], align 4 +// CHECK6-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK6: omp.inner.for.cond: +// CHECK6-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK6-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK6-NEXT: [[CMP4:%.*]] = icmp ule i32 [[TMP10]], [[TMP11]] +// CHECK6-NEXT: br i1 [[CMP4]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK6: omp.inner.for.body: +// CHECK6-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK6-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP12]], 1 +// CHECK6-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK6-NEXT: store i32 [[ADD]], i32* [[I3]], align 4 +// CHECK6-NEXT: [[TMP13:%.*]] = load i32, i32* [[I3]], align 4 +// CHECK6-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1000 x i16], [1000 x i16]* [[TMP0]], i32 0, i32 [[TMP13]] +// CHECK6-NEXT: [[TMP14:%.*]] = load i16, i16* [[ARRAYIDX]], align 2 +// CHECK6-NEXT: [[CONV:%.*]] = sext i16 [[TMP14]] to i32 +// CHECK6-NEXT: [[ADD5:%.*]] = add nsw i32 [[CONV]], 1 +// CHECK6-NEXT: [[CONV6:%.*]] = trunc i32 [[ADD5]] to i16 +// CHECK6-NEXT: store i16 [[CONV6]], i16* [[ARRAYIDX]], align 2 +// CHECK6-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK6: 
omp.body.continue: +// CHECK6-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK6: omp.inner.for.inc: +// CHECK6-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK6-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK6-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP15]], [[TMP16]] +// CHECK6-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_IV]], align 4 +// CHECK6-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK6: omp.inner.for.end: +// CHECK6-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK6: omp.loop.exit: +// CHECK6-NEXT: [[TMP17:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK6-NEXT: [[TMP18:%.*]] = load i32, i32* [[TMP17]], align 4 +// CHECK6-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP18]]) +// CHECK6-NEXT: br label [[OMP_PRECOND_END]] +// CHECK6: omp.precond.end: +// CHECK6-NEXT: ret void +// +// +// CHECK6-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l54 +// CHECK6-SAME: ([10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { +// CHECK6-NEXT: entry: +// CHECK6-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 +// CHECK6-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK6-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 +// CHECK6-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 +// CHECK6-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK6-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) +// CHECK6-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK6: .execute: +// CHECK6-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) +// CHECK6-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK6-NEXT: call void @__omp_outlined__4(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x i32]* [[TMP0]]) #[[ATTR2]] +// CHECK6-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK6: .omp.deinit: +// CHECK6-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) +// CHECK6-NEXT: br label [[DOTEXIT:%.*]] +// CHECK6: .exit: +// CHECK6-NEXT: ret void +// +// +// CHECK6-LABEL: define {{[^@]+}}@__omp_outlined__4 +// CHECK6-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { +// CHECK6-NEXT: entry: +// CHECK6-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK6-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK6-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 +// CHECK6-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [3 x i8*], align 4 +// CHECK6-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK6-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK6-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 +// CHECK6-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 +// CHECK6-NEXT: store i32 0, 
i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK6-NEXT: store i32 9, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK6-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK6-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK6-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK6-NEXT: [[TMP1:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK6-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4 +// CHECK6-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) +// CHECK6-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK6-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP3]], 9 +// CHECK6-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK6: cond.true: +// CHECK6-NEXT: br label [[COND_END:%.*]] +// CHECK6: cond.false: +// CHECK6-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK6-NEXT: br label [[COND_END]] +// CHECK6: cond.end: +// CHECK6-NEXT: [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP4]], [[COND_FALSE]] ] +// CHECK6-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK6-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK6-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 +// CHECK6-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK6: omp.inner.for.cond: +// CHECK6-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK6-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP6]], 10 +// CHECK6-NEXT: br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK6: omp.inner.for.body: +// CHECK6-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK6-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK6-NEXT: [[TMP9:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 +// CHECK6-NEXT: [[TMP10:%.*]] = inttoptr i32 [[TMP7]] to i8* +// CHECK6-NEXT: store i8* [[TMP10]], i8** [[TMP9]], align 4 +// CHECK6-NEXT: [[TMP11:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 +// CHECK6-NEXT: [[TMP12:%.*]] = inttoptr i32 [[TMP8]] to i8* +// CHECK6-NEXT: store i8* [[TMP12]], i8** [[TMP11]], align 4 +// CHECK6-NEXT: [[TMP13:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 +// CHECK6-NEXT: [[TMP14:%.*]] = bitcast [10 x i32]* [[TMP0]] to i8* +// CHECK6-NEXT: store i8* [[TMP14]], i8** [[TMP13]], align 4 +// CHECK6-NEXT: [[TMP15:%.*]] = bitcast [3 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK6-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, [10 x i32]*)* @__omp_outlined__5 to i8*), i8* null, i8** [[TMP15]], i32 3) +// CHECK6-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK6: omp.inner.for.inc: +// CHECK6-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK6-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK6-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP16]], [[TMP17]] +// CHECK6-NEXT: store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4 +// CHECK6-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK6-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK6-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP18]], [[TMP19]] +// CHECK6-NEXT: 
store i32 [[ADD2]], i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK6-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK6-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK6-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP20]], [[TMP21]] +// CHECK6-NEXT: store i32 [[ADD3]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK6-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK6-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP22]], 9 +// CHECK6-NEXT: br i1 [[CMP4]], label [[COND_TRUE5:%.*]], label [[COND_FALSE6:%.*]] +// CHECK6: cond.true5: +// CHECK6-NEXT: br label [[COND_END7:%.*]] +// CHECK6: cond.false6: +// CHECK6-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK6-NEXT: br label [[COND_END7]] +// CHECK6: cond.end7: +// CHECK6-NEXT: [[COND8:%.*]] = phi i32 [ 9, [[COND_TRUE5]] ], [ [[TMP23]], [[COND_FALSE6]] ] +// CHECK6-NEXT: store i32 [[COND8]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK6-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK6-NEXT: store i32 [[TMP24]], i32* [[DOTOMP_IV]], align 4 +// CHECK6-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK6: omp.inner.for.end: +// CHECK6-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK6: omp.loop.exit: +// CHECK6-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]]) +// CHECK6-NEXT: ret void +// +// +// CHECK6-LABEL: define {{[^@]+}}@__omp_outlined__5 +// CHECK6-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { +// CHECK6-NEXT: entry: +// CHECK6-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK6-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK6-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 +// CHECK6-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK6-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK6-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK6-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK6-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 +// CHECK6-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 +// CHECK6-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 +// CHECK6-NEXT: store i32 9, i32* [[DOTOMP_UB]], align 4 +// CHECK6-NEXT: [[TMP1:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK6-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK6-NEXT: store i32 [[TMP1]], i32* [[DOTOMP_LB]], align 4 +// CHECK6-NEXT: store i32 [[TMP2]], i32* [[DOTOMP_UB]], align 4 +// CHECK6-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK6-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK6-NEXT: [[TMP3:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK6-NEXT: [[TMP4:%.*]] 
= load i32, i32* [[TMP3]], align 4 +// CHECK6-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP4]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) +// CHECK6-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK6-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 +// CHECK6-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK6: omp.inner.for.cond: +// CHECK6-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK6-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK6-NEXT: [[CMP:%.*]] = icmp ule i32 [[TMP6]], [[TMP7]] +// CHECK6-NEXT: br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK6: omp.inner.for.body: +// CHECK6-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK6-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP8]], 1 +// CHECK6-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK6-NEXT: store i32 [[ADD]], i32* [[I]], align 4 +// CHECK6-NEXT: [[TMP9:%.*]] = load i32, i32* [[I]], align 4 +// CHECK6-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP0]], i32 0, i32 [[TMP9]] +// CHECK6-NEXT: [[TMP10:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +// CHECK6-NEXT: [[ADD1:%.*]] = add nsw i32 [[TMP10]], 1 +// CHECK6-NEXT: store i32 [[ADD1]], i32* [[ARRAYIDX]], align 4 +// CHECK6-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK6: omp.body.continue: +// CHECK6-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK6: omp.inner.for.inc: +// CHECK6-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK6-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK6-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP11]], [[TMP12]] +// CHECK6-NEXT: store i32 [[ADD2]], i32* [[DOTOMP_IV]], align 4 +// CHECK6-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK6: omp.inner.for.end: +// CHECK6-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK6: omp.loop.exit: +// CHECK6-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP4]]) +// CHECK6-NEXT: ret void +// +// +// CHECK6-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l59 +// CHECK6-SAME: ([10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i32 [[F:%.*]]) #[[ATTR0]] { +// CHECK6-NEXT: entry: +// CHECK6-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4 +// CHECK6-NEXT: [[F_ADDR:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[F_CASTED:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK6-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4 +// CHECK6-NEXT: store i32 [[F]], i32* [[F_ADDR]], align 4 +// CHECK6-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4 +// CHECK6-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK6-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) +// CHECK6-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK6: .execute: +// CHECK6-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) +// CHECK6-NEXT: [[TMP2:%.*]] = load i32, i32* [[F_ADDR]], align 4 +// CHECK6-NEXT: store i32 [[TMP2]], i32* [[F_CASTED]], align 4 +// CHECK6-NEXT: [[TMP3:%.*]] = load i32, i32* [[F_CASTED]], align 4 +// CHECK6-NEXT: store i32 [[TMP1]], 
i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK6-NEXT: call void @__omp_outlined__6(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x [10 x i32]]* [[TMP0]], i32 [[TMP3]]) #[[ATTR2]] +// CHECK6-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK6: .omp.deinit: +// CHECK6-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) +// CHECK6-NEXT: br label [[DOTEXIT:%.*]] +// CHECK6: .exit: +// CHECK6-NEXT: ret void +// +// +// CHECK6-LABEL: define {{[^@]+}}@__omp_outlined__6 +// CHECK6-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i32 [[F:%.*]]) #[[ATTR0]] { +// CHECK6-NEXT: entry: +// CHECK6-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK6-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK6-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4 +// CHECK6-NEXT: [[F_ADDR:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[_TMP1:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[K:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[J:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[F_CASTED:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x i8*], align 4 +// CHECK6-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK6-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK6-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4 +// CHECK6-NEXT: store i32 [[F]], i32* [[F_ADDR]], align 4 +// CHECK6-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4 +// CHECK6-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK6-NEXT: store i32 99, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK6-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK6-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK6-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK6-NEXT: [[TMP1:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK6-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4 +// CHECK6-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) +// CHECK6-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK6-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP3]], 99 +// CHECK6-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK6: cond.true: +// CHECK6-NEXT: br label [[COND_END:%.*]] +// CHECK6: cond.false: +// CHECK6-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK6-NEXT: br label [[COND_END]] +// CHECK6: cond.end: +// CHECK6-NEXT: [[COND:%.*]] = phi i32 [ 99, [[COND_TRUE]] ], [ [[TMP4]], [[COND_FALSE]] ] +// CHECK6-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK6-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK6-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 +// CHECK6-NEXT: br label 
[[OMP_INNER_FOR_COND:%.*]] +// CHECK6: omp.inner.for.cond: +// CHECK6-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK6-NEXT: [[CMP2:%.*]] = icmp slt i32 [[TMP6]], 100 +// CHECK6-NEXT: br i1 [[CMP2]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK6: omp.inner.for.body: +// CHECK6-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK6-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK6-NEXT: [[TMP9:%.*]] = load i32, i32* [[F_ADDR]], align 4 +// CHECK6-NEXT: store i32 [[TMP9]], i32* [[F_CASTED]], align 4 +// CHECK6-NEXT: [[TMP10:%.*]] = load i32, i32* [[F_CASTED]], align 4 +// CHECK6-NEXT: [[TMP11:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 +// CHECK6-NEXT: [[TMP12:%.*]] = inttoptr i32 [[TMP7]] to i8* +// CHECK6-NEXT: store i8* [[TMP12]], i8** [[TMP11]], align 4 +// CHECK6-NEXT: [[TMP13:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 +// CHECK6-NEXT: [[TMP14:%.*]] = inttoptr i32 [[TMP8]] to i8* +// CHECK6-NEXT: store i8* [[TMP14]], i8** [[TMP13]], align 4 +// CHECK6-NEXT: [[TMP15:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 +// CHECK6-NEXT: [[TMP16:%.*]] = bitcast [10 x [10 x i32]]* [[TMP0]] to i8* +// CHECK6-NEXT: store i8* [[TMP16]], i8** [[TMP15]], align 4 +// CHECK6-NEXT: [[TMP17:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3 +// CHECK6-NEXT: [[TMP18:%.*]] = inttoptr i32 [[TMP10]] to i8* +// CHECK6-NEXT: store i8* [[TMP18]], i8** [[TMP17]], align 4 +// CHECK6-NEXT: [[TMP19:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK6-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, [10 x [10 x i32]]*, i32)* @__omp_outlined__7 to i8*), i8* null, i8** [[TMP19]], i32 4) +// CHECK6-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK6: omp.inner.for.inc: +// CHECK6-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK6-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK6-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP20]], [[TMP21]] +// CHECK6-NEXT: store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4 +// CHECK6-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK6-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK6-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP22]], [[TMP23]] +// CHECK6-NEXT: store i32 [[ADD3]], i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK6-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK6-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK6-NEXT: [[ADD4:%.*]] = add nsw i32 [[TMP24]], [[TMP25]] +// CHECK6-NEXT: store i32 [[ADD4]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK6-NEXT: [[TMP26:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK6-NEXT: [[CMP5:%.*]] = icmp sgt i32 [[TMP26]], 99 +// CHECK6-NEXT: br i1 [[CMP5]], label [[COND_TRUE6:%.*]], label [[COND_FALSE7:%.*]] +// CHECK6: cond.true6: +// CHECK6-NEXT: br label [[COND_END8:%.*]] +// CHECK6: cond.false7: +// CHECK6-NEXT: [[TMP27:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK6-NEXT: br label [[COND_END8]] +// CHECK6: cond.end8: +// CHECK6-NEXT: [[COND9:%.*]] = phi i32 [ 99, [[COND_TRUE6]] ], [ [[TMP27]], [[COND_FALSE7]] ] +// CHECK6-NEXT: store i32 [[COND9]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK6-NEXT: [[TMP28:%.*]] = 
load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK6-NEXT: store i32 [[TMP28]], i32* [[DOTOMP_IV]], align 4 +// CHECK6-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK6: omp.inner.for.end: +// CHECK6-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK6: omp.loop.exit: +// CHECK6-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]]) +// CHECK6-NEXT: ret void +// +// +// CHECK6-LABEL: define {{[^@]+}}@__omp_outlined__7 +// CHECK6-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i32 [[F:%.*]]) #[[ATTR0]] { +// CHECK6-NEXT: entry: +// CHECK6-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK6-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK6-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4 +// CHECK6-NEXT: [[F_ADDR:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[_TMP1:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[K:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[J:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK6-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK6-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK6-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK6-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4 +// CHECK6-NEXT: store i32 [[F]], i32* [[F_ADDR]], align 4 +// CHECK6-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4 +// CHECK6-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 +// CHECK6-NEXT: store i32 99, i32* [[DOTOMP_UB]], align 4 +// CHECK6-NEXT: [[TMP1:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK6-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK6-NEXT: store i32 [[TMP1]], i32* [[DOTOMP_LB]], align 4 +// CHECK6-NEXT: store i32 [[TMP2]], i32* [[DOTOMP_UB]], align 4 +// CHECK6-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK6-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK6-NEXT: [[TMP3:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK6-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4 +// CHECK6-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP4]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) +// CHECK6-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK6-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 +// CHECK6-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK6: omp.inner.for.cond: +// CHECK6-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK6-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK6-NEXT: [[CMP:%.*]] = icmp ule i32 
[[TMP6]], [[TMP7]] +// CHECK6-NEXT: br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK6: omp.inner.for.body: +// CHECK6-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK6-NEXT: [[DIV:%.*]] = sdiv i32 [[TMP8]], 10 +// CHECK6-NEXT: [[MUL:%.*]] = mul nsw i32 [[DIV]], 1 +// CHECK6-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK6-NEXT: store i32 [[ADD]], i32* [[I]], align 4 +// CHECK6-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK6-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK6-NEXT: [[DIV2:%.*]] = sdiv i32 [[TMP10]], 10 +// CHECK6-NEXT: [[MUL3:%.*]] = mul nsw i32 [[DIV2]], 10 +// CHECK6-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP9]], [[MUL3]] +// CHECK6-NEXT: [[MUL4:%.*]] = mul nsw i32 [[SUB]], 1 +// CHECK6-NEXT: [[ADD5:%.*]] = add nsw i32 0, [[MUL4]] +// CHECK6-NEXT: store i32 [[ADD5]], i32* [[J]], align 4 +// CHECK6-NEXT: store i32 10, i32* [[K]], align 4 +// CHECK6-NEXT: [[TMP11:%.*]] = load i32, i32* [[I]], align 4 +// CHECK6-NEXT: [[TMP12:%.*]] = load i32, i32* [[J]], align 4 +// CHECK6-NEXT: [[TMP13:%.*]] = load i32, i32* [[F_ADDR]], align 4 +// CHECK6-NEXT: [[MUL6:%.*]] = mul nsw i32 [[TMP12]], [[TMP13]] +// CHECK6-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP11]], [[MUL6]] +// CHECK6-NEXT: [[TMP14:%.*]] = load i32, i32* [[K]], align 4 +// CHECK6-NEXT: [[ADD8:%.*]] = add nsw i32 [[ADD7]], [[TMP14]] +// CHECK6-NEXT: [[TMP15:%.*]] = load i32, i32* [[I]], align 4 +// CHECK6-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* [[TMP0]], i32 0, i32 [[TMP15]] +// CHECK6-NEXT: [[TMP16:%.*]] = load i32, i32* [[J]], align 4 +// CHECK6-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[ARRAYIDX]], i32 0, i32 [[TMP16]] +// CHECK6-NEXT: store i32 [[ADD8]], i32* [[ARRAYIDX9]], align 4 +// CHECK6-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK6: omp.body.continue: +// CHECK6-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK6: omp.inner.for.inc: +// CHECK6-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK6-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK6-NEXT: [[ADD10:%.*]] = add nsw i32 [[TMP17]], [[TMP18]] +// CHECK6-NEXT: store i32 [[ADD10]], i32* [[DOTOMP_IV]], align 4 +// CHECK6-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK6: omp.inner.for.end: +// CHECK6-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK6: omp.loop.exit: +// CHECK6-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP4]]) +// CHECK6-NEXT: ret void +// +// +// CHECK6-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l67 +// CHECK6-SAME: (i32 [[N:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]]) #[[ATTR0]] { +// CHECK6-NEXT: entry: +// CHECK6-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4 +// CHECK6-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK6-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK6-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4 +// CHECK6-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4 +// CHECK6-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK6-NEXT: call void 
@__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) +// CHECK6-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK6: .execute: +// CHECK6-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) +// CHECK6-NEXT: [[TMP2:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK6-NEXT: store i32 [[TMP2]], i32* [[N_CASTED]], align 4 +// CHECK6-NEXT: [[TMP3:%.*]] = load i32, i32* [[N_CASTED]], align 4 +// CHECK6-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK6-NEXT: call void @__omp_outlined__8(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32 [[TMP3]], [10 x [10 x i32]]* [[TMP0]]) #[[ATTR2]] +// CHECK6-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK6: .omp.deinit: +// CHECK6-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) +// CHECK6-NEXT: br label [[DOTEXIT:%.*]] +// CHECK6: .exit: +// CHECK6-NEXT: ret void +// +// +// CHECK6-LABEL: define {{[^@]+}}@__omp_outlined__8 +// CHECK6-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[N:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]]) #[[ATTR0]] { +// CHECK6-NEXT: entry: +// CHECK6-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK6-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK6-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4 +// CHECK6-NEXT: [[DOTOMP_IV:%.*]] = alloca i64, align 8 +// CHECK6-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[_TMP1:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTCAPTURE_EXPR_3:%.*]] = alloca i64, align 8 +// CHECK6-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[J:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i64, align 8 +// CHECK6-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i64, align 8 +// CHECK6-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i64, align 8 +// CHECK6-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[I9:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[J10:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x i8*], align 4 +// CHECK6-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK6-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK6-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK6-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4 +// CHECK6-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4 +// CHECK6-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK6-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK6-NEXT: [[TMP2:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK6-NEXT: store i32 [[TMP2]], i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK6-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK6-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP3]], 0 +// CHECK6-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK6-NEXT: [[CONV:%.*]] = sext i32 [[DIV]] to i64 +// CHECK6-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK6-NEXT: [[SUB4:%.*]] = sub nsw i32 [[TMP4]], 0 +// CHECK6-NEXT: [[DIV5:%.*]] = sdiv i32 [[SUB4]], 1 +// CHECK6-NEXT: [[CONV6:%.*]] = sext i32 [[DIV5]] to i64 +// CHECK6-NEXT: [[MUL:%.*]] = mul nsw i64 
[[CONV]], [[CONV6]] +// CHECK6-NEXT: [[SUB7:%.*]] = sub nsw i64 [[MUL]], 1 +// CHECK6-NEXT: store i64 [[SUB7]], i64* [[DOTCAPTURE_EXPR_3]], align 8 +// CHECK6-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK6-NEXT: store i32 0, i32* [[J]], align 4 +// CHECK6-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK6-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP5]] +// CHECK6-NEXT: br i1 [[CMP]], label [[LAND_LHS_TRUE:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK6: land.lhs.true: +// CHECK6-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK6-NEXT: [[CMP8:%.*]] = icmp slt i32 0, [[TMP6]] +// CHECK6-NEXT: br i1 [[CMP8]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END]] +// CHECK6: omp.precond.then: +// CHECK6-NEXT: store i64 0, i64* [[DOTOMP_COMB_LB]], align 8 +// CHECK6-NEXT: [[TMP7:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8 +// CHECK6-NEXT: store i64 [[TMP7]], i64* [[DOTOMP_COMB_UB]], align 8 +// CHECK6-NEXT: store i64 1, i64* [[DOTOMP_STRIDE]], align 8 +// CHECK6-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK6-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK6-NEXT: [[CONV11:%.*]] = zext i32 [[NVPTX_NUM_THREADS]] to i64 +// CHECK6-NEXT: [[TMP8:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK6-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP8]], align 4 +// CHECK6-NEXT: call void @__kmpc_for_static_init_8(%struct.ident_t* @[[GLOB1]], i32 [[TMP9]], i32 91, i32* [[DOTOMP_IS_LAST]], i64* [[DOTOMP_COMB_LB]], i64* [[DOTOMP_COMB_UB]], i64* [[DOTOMP_STRIDE]], i64 1, i64 [[CONV11]]) +// CHECK6-NEXT: [[TMP10:%.*]] = load i64, i64* [[DOTOMP_COMB_UB]], align 8 +// CHECK6-NEXT: [[TMP11:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8 +// CHECK6-NEXT: [[CMP12:%.*]] = icmp sgt i64 [[TMP10]], [[TMP11]] +// CHECK6-NEXT: br i1 [[CMP12]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK6: cond.true: +// CHECK6-NEXT: [[TMP12:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8 +// CHECK6-NEXT: br label [[COND_END:%.*]] +// CHECK6: cond.false: +// CHECK6-NEXT: [[TMP13:%.*]] = load i64, i64* [[DOTOMP_COMB_UB]], align 8 +// CHECK6-NEXT: br label [[COND_END]] +// CHECK6: cond.end: +// CHECK6-NEXT: [[COND:%.*]] = phi i64 [ [[TMP12]], [[COND_TRUE]] ], [ [[TMP13]], [[COND_FALSE]] ] +// CHECK6-NEXT: store i64 [[COND]], i64* [[DOTOMP_COMB_UB]], align 8 +// CHECK6-NEXT: [[TMP14:%.*]] = load i64, i64* [[DOTOMP_COMB_LB]], align 8 +// CHECK6-NEXT: store i64 [[TMP14]], i64* [[DOTOMP_IV]], align 8 +// CHECK6-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK6: omp.inner.for.cond: +// CHECK6-NEXT: [[TMP15:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8 +// CHECK6-NEXT: [[TMP16:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8 +// CHECK6-NEXT: [[ADD:%.*]] = add nsw i64 [[TMP16]], 1 +// CHECK6-NEXT: [[CMP13:%.*]] = icmp slt i64 [[TMP15]], [[ADD]] +// CHECK6-NEXT: br i1 [[CMP13]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK6: omp.inner.for.body: +// CHECK6-NEXT: [[TMP17:%.*]] = load i64, i64* [[DOTOMP_COMB_LB]], align 8 +// CHECK6-NEXT: [[TMP18:%.*]] = trunc i64 [[TMP17]] to i32 +// CHECK6-NEXT: [[TMP19:%.*]] = load i64, i64* [[DOTOMP_COMB_UB]], align 8 +// CHECK6-NEXT: [[TMP20:%.*]] = trunc i64 [[TMP19]] to i32 +// CHECK6-NEXT: [[TMP21:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK6-NEXT: store i32 [[TMP21]], i32* [[N_CASTED]], align 4 +// CHECK6-NEXT: [[TMP22:%.*]] = load i32, i32* [[N_CASTED]], align 4 +// CHECK6-NEXT: [[TMP23:%.*]] = 
getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 +// CHECK6-NEXT: [[TMP24:%.*]] = inttoptr i32 [[TMP18]] to i8* +// CHECK6-NEXT: store i8* [[TMP24]], i8** [[TMP23]], align 4 +// CHECK6-NEXT: [[TMP25:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 +// CHECK6-NEXT: [[TMP26:%.*]] = inttoptr i32 [[TMP20]] to i8* +// CHECK6-NEXT: store i8* [[TMP26]], i8** [[TMP25]], align 4 +// CHECK6-NEXT: [[TMP27:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 +// CHECK6-NEXT: [[TMP28:%.*]] = inttoptr i32 [[TMP22]] to i8* +// CHECK6-NEXT: store i8* [[TMP28]], i8** [[TMP27]], align 4 +// CHECK6-NEXT: [[TMP29:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3 +// CHECK6-NEXT: [[TMP30:%.*]] = bitcast [10 x [10 x i32]]* [[TMP0]] to i8* +// CHECK6-NEXT: store i8* [[TMP30]], i8** [[TMP29]], align 4 +// CHECK6-NEXT: [[TMP31:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK6-NEXT: [[TMP32:%.*]] = load i32, i32* [[TMP31]], align 4 +// CHECK6-NEXT: [[TMP33:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK6-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP32]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, i32, [10 x [10 x i32]]*)* @__omp_outlined__9 to i8*), i8* null, i8** [[TMP33]], i32 4) +// CHECK6-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK6: omp.inner.for.inc: +// CHECK6-NEXT: [[TMP34:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8 +// CHECK6-NEXT: [[TMP35:%.*]] = load i64, i64* [[DOTOMP_STRIDE]], align 8 +// CHECK6-NEXT: [[ADD14:%.*]] = add nsw i64 [[TMP34]], [[TMP35]] +// CHECK6-NEXT: store i64 [[ADD14]], i64* [[DOTOMP_IV]], align 8 +// CHECK6-NEXT: [[TMP36:%.*]] = load i64, i64* [[DOTOMP_COMB_LB]], align 8 +// CHECK6-NEXT: [[TMP37:%.*]] = load i64, i64* [[DOTOMP_STRIDE]], align 8 +// CHECK6-NEXT: [[ADD15:%.*]] = add nsw i64 [[TMP36]], [[TMP37]] +// CHECK6-NEXT: store i64 [[ADD15]], i64* [[DOTOMP_COMB_LB]], align 8 +// CHECK6-NEXT: [[TMP38:%.*]] = load i64, i64* [[DOTOMP_COMB_UB]], align 8 +// CHECK6-NEXT: [[TMP39:%.*]] = load i64, i64* [[DOTOMP_STRIDE]], align 8 +// CHECK6-NEXT: [[ADD16:%.*]] = add nsw i64 [[TMP38]], [[TMP39]] +// CHECK6-NEXT: store i64 [[ADD16]], i64* [[DOTOMP_COMB_UB]], align 8 +// CHECK6-NEXT: [[TMP40:%.*]] = load i64, i64* [[DOTOMP_COMB_UB]], align 8 +// CHECK6-NEXT: [[TMP41:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8 +// CHECK6-NEXT: [[CMP17:%.*]] = icmp sgt i64 [[TMP40]], [[TMP41]] +// CHECK6-NEXT: br i1 [[CMP17]], label [[COND_TRUE18:%.*]], label [[COND_FALSE19:%.*]] +// CHECK6: cond.true18: +// CHECK6-NEXT: [[TMP42:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8 +// CHECK6-NEXT: br label [[COND_END20:%.*]] +// CHECK6: cond.false19: +// CHECK6-NEXT: [[TMP43:%.*]] = load i64, i64* [[DOTOMP_COMB_UB]], align 8 +// CHECK6-NEXT: br label [[COND_END20]] +// CHECK6: cond.end20: +// CHECK6-NEXT: [[COND21:%.*]] = phi i64 [ [[TMP42]], [[COND_TRUE18]] ], [ [[TMP43]], [[COND_FALSE19]] ] +// CHECK6-NEXT: store i64 [[COND21]], i64* [[DOTOMP_COMB_UB]], align 8 +// CHECK6-NEXT: [[TMP44:%.*]] = load i64, i64* [[DOTOMP_COMB_LB]], align 8 +// CHECK6-NEXT: store i64 [[TMP44]], i64* [[DOTOMP_IV]], align 8 +// CHECK6-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK6: omp.inner.for.end: +// CHECK6-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK6: omp.loop.exit: +// CHECK6-NEXT: [[TMP45:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// 
CHECK6-NEXT: [[TMP46:%.*]] = load i32, i32* [[TMP45]], align 4 +// CHECK6-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP46]]) +// CHECK6-NEXT: br label [[OMP_PRECOND_END]] +// CHECK6: omp.precond.end: +// CHECK6-NEXT: ret void +// +// +// CHECK6-LABEL: define {{[^@]+}}@__omp_outlined__9 +// CHECK6-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32 [[N:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]]) #[[ATTR0]] { +// CHECK6-NEXT: entry: +// CHECK6-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK6-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK6-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4 +// CHECK6-NEXT: [[DOTOMP_IV:%.*]] = alloca i64, align 8 +// CHECK6-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[_TMP1:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTCAPTURE_EXPR_3:%.*]] = alloca i64, align 8 +// CHECK6-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[J:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTOMP_LB:%.*]] = alloca i64, align 8 +// CHECK6-NEXT: [[DOTOMP_UB:%.*]] = alloca i64, align 8 +// CHECK6-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i64, align 8 +// CHECK6-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[I11:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[J12:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK6-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK6-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK6-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK6-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK6-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4 +// CHECK6-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4 +// CHECK6-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK6-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK6-NEXT: [[TMP2:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK6-NEXT: store i32 [[TMP2]], i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK6-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK6-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP3]], 0 +// CHECK6-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK6-NEXT: [[CONV:%.*]] = sext i32 [[DIV]] to i64 +// CHECK6-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK6-NEXT: [[SUB4:%.*]] = sub nsw i32 [[TMP4]], 0 +// CHECK6-NEXT: [[DIV5:%.*]] = sdiv i32 [[SUB4]], 1 +// CHECK6-NEXT: [[CONV6:%.*]] = sext i32 [[DIV5]] to i64 +// CHECK6-NEXT: [[MUL:%.*]] = mul nsw i64 [[CONV]], [[CONV6]] +// CHECK6-NEXT: [[SUB7:%.*]] = sub nsw i64 [[MUL]], 1 +// CHECK6-NEXT: store i64 [[SUB7]], i64* [[DOTCAPTURE_EXPR_3]], align 8 +// CHECK6-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK6-NEXT: store i32 0, i32* [[J]], align 4 +// CHECK6-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK6-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP5]] 
+// CHECK6-NEXT: br i1 [[CMP]], label [[LAND_LHS_TRUE:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK6: land.lhs.true: +// CHECK6-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK6-NEXT: [[CMP8:%.*]] = icmp slt i32 0, [[TMP6]] +// CHECK6-NEXT: br i1 [[CMP8]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END]] +// CHECK6: omp.precond.then: +// CHECK6-NEXT: store i64 0, i64* [[DOTOMP_LB]], align 8 +// CHECK6-NEXT: [[TMP7:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8 +// CHECK6-NEXT: store i64 [[TMP7]], i64* [[DOTOMP_UB]], align 8 +// CHECK6-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK6-NEXT: [[CONV9:%.*]] = zext i32 [[TMP8]] to i64 +// CHECK6-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK6-NEXT: [[CONV10:%.*]] = zext i32 [[TMP9]] to i64 +// CHECK6-NEXT: store i64 [[CONV9]], i64* [[DOTOMP_LB]], align 8 +// CHECK6-NEXT: store i64 [[CONV10]], i64* [[DOTOMP_UB]], align 8 +// CHECK6-NEXT: store i64 1, i64* [[DOTOMP_STRIDE]], align 8 +// CHECK6-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK6-NEXT: [[TMP10:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK6-NEXT: [[TMP11:%.*]] = load i32, i32* [[TMP10]], align 4 +// CHECK6-NEXT: call void @__kmpc_for_static_init_8(%struct.ident_t* @[[GLOB2]], i32 [[TMP11]], i32 33, i32* [[DOTOMP_IS_LAST]], i64* [[DOTOMP_LB]], i64* [[DOTOMP_UB]], i64* [[DOTOMP_STRIDE]], i64 1, i64 1) +// CHECK6-NEXT: [[TMP12:%.*]] = load i64, i64* [[DOTOMP_LB]], align 8 +// CHECK6-NEXT: store i64 [[TMP12]], i64* [[DOTOMP_IV]], align 8 +// CHECK6-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK6: omp.inner.for.cond: +// CHECK6-NEXT: [[TMP13:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8 +// CHECK6-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK6-NEXT: [[CONV13:%.*]] = zext i32 [[TMP14]] to i64 +// CHECK6-NEXT: [[CMP14:%.*]] = icmp sle i64 [[TMP13]], [[CONV13]] +// CHECK6-NEXT: br i1 [[CMP14]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK6: omp.inner.for.body: +// CHECK6-NEXT: [[TMP15:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8 +// CHECK6-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK6-NEXT: [[SUB15:%.*]] = sub nsw i32 [[TMP16]], 0 +// CHECK6-NEXT: [[DIV16:%.*]] = sdiv i32 [[SUB15]], 1 +// CHECK6-NEXT: [[MUL17:%.*]] = mul nsw i32 1, [[DIV16]] +// CHECK6-NEXT: [[CONV18:%.*]] = sext i32 [[MUL17]] to i64 +// CHECK6-NEXT: [[DIV19:%.*]] = sdiv i64 [[TMP15]], [[CONV18]] +// CHECK6-NEXT: [[MUL20:%.*]] = mul nsw i64 [[DIV19]], 1 +// CHECK6-NEXT: [[ADD:%.*]] = add nsw i64 0, [[MUL20]] +// CHECK6-NEXT: [[CONV21:%.*]] = trunc i64 [[ADD]] to i32 +// CHECK6-NEXT: store i32 [[CONV21]], i32* [[I11]], align 4 +// CHECK6-NEXT: [[TMP17:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8 +// CHECK6-NEXT: [[TMP18:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8 +// CHECK6-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK6-NEXT: [[SUB22:%.*]] = sub nsw i32 [[TMP19]], 0 +// CHECK6-NEXT: [[DIV23:%.*]] = sdiv i32 [[SUB22]], 1 +// CHECK6-NEXT: [[MUL24:%.*]] = mul nsw i32 1, [[DIV23]] +// CHECK6-NEXT: [[CONV25:%.*]] = sext i32 [[MUL24]] to i64 +// CHECK6-NEXT: [[DIV26:%.*]] = sdiv i64 [[TMP18]], [[CONV25]] +// CHECK6-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK6-NEXT: [[SUB27:%.*]] = sub nsw i32 [[TMP20]], 0 +// CHECK6-NEXT: [[DIV28:%.*]] = sdiv i32 [[SUB27]], 1 +// CHECK6-NEXT: [[MUL29:%.*]] = mul nsw i32 1, [[DIV28]] +// 
CHECK6-NEXT: [[CONV30:%.*]] = sext i32 [[MUL29]] to i64 +// CHECK6-NEXT: [[MUL31:%.*]] = mul nsw i64 [[DIV26]], [[CONV30]] +// CHECK6-NEXT: [[SUB32:%.*]] = sub nsw i64 [[TMP17]], [[MUL31]] +// CHECK6-NEXT: [[MUL33:%.*]] = mul nsw i64 [[SUB32]], 1 +// CHECK6-NEXT: [[ADD34:%.*]] = add nsw i64 0, [[MUL33]] +// CHECK6-NEXT: [[CONV35:%.*]] = trunc i64 [[ADD34]] to i32 +// CHECK6-NEXT: store i32 [[CONV35]], i32* [[J12]], align 4 +// CHECK6-NEXT: [[TMP21:%.*]] = load i32, i32* [[I11]], align 4 +// CHECK6-NEXT: [[TMP22:%.*]] = load i32, i32* [[J12]], align 4 +// CHECK6-NEXT: [[ADD36:%.*]] = add nsw i32 [[TMP21]], [[TMP22]] +// CHECK6-NEXT: [[TMP23:%.*]] = load i32, i32* [[I11]], align 4 +// CHECK6-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* [[TMP0]], i32 0, i32 [[TMP23]] +// CHECK6-NEXT: [[TMP24:%.*]] = load i32, i32* [[J12]], align 4 +// CHECK6-NEXT: [[ARRAYIDX37:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[ARRAYIDX]], i32 0, i32 [[TMP24]] +// CHECK6-NEXT: store i32 [[ADD36]], i32* [[ARRAYIDX37]], align 4 +// CHECK6-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK6: omp.body.continue: +// CHECK6-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK6: omp.inner.for.inc: +// CHECK6-NEXT: [[TMP25:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8 +// CHECK6-NEXT: [[TMP26:%.*]] = load i64, i64* [[DOTOMP_STRIDE]], align 8 +// CHECK6-NEXT: [[ADD38:%.*]] = add nsw i64 [[TMP25]], [[TMP26]] +// CHECK6-NEXT: store i64 [[ADD38]], i64* [[DOTOMP_IV]], align 8 +// CHECK6-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK6: omp.inner.for.end: +// CHECK6-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK6: omp.loop.exit: +// CHECK6-NEXT: [[TMP27:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK6-NEXT: [[TMP28:%.*]] = load i32, i32* [[TMP27]], align 4 +// CHECK6-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP28]]) +// CHECK6-NEXT: br label [[OMP_PRECOND_END]] +// CHECK6: omp.precond.end: +// CHECK6-NEXT: ret void +// +// +// CHECK6-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l74 +// CHECK6-SAME: (i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32* [[V:%.*]]) #[[ATTR0]] { +// CHECK6-NEXT: entry: +// CHECK6-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4 +// CHECK6-NEXT: [[V_ADDR:%.*]] = alloca i32*, align 4 +// CHECK6-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK6-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK6-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4 +// CHECK6-NEXT: store i32* [[V]], i32** [[V_ADDR]], align 4 +// CHECK6-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4 +// CHECK6-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK6-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) +// CHECK6-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK6: .execute: +// CHECK6-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) +// CHECK6-NEXT: [[TMP2:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK6-NEXT: store i32 [[TMP2]], i32* [[N_CASTED]], align 4 +// CHECK6-NEXT: [[TMP3:%.*]] = load i32, i32* [[N_CASTED]], align 4 +// CHECK6-NEXT: [[TMP4:%.*]] = load i32*, 
i32** [[V_ADDR]], align 4 +// CHECK6-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK6-NEXT: call void @__omp_outlined__10(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32 [[TMP3]], [1000 x i32]* [[TMP0]], i32* [[TMP4]]) #[[ATTR2]] +// CHECK6-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK6: .omp.deinit: +// CHECK6-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) +// CHECK6-NEXT: br label [[DOTEXIT:%.*]] +// CHECK6: .exit: +// CHECK6-NEXT: ret void +// +// +// CHECK6-LABEL: define {{[^@]+}}@__omp_outlined__10 +// CHECK6-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32* [[V:%.*]]) #[[ATTR0]] { +// CHECK6-NEXT: entry: +// CHECK6-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK6-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK6-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4 +// CHECK6-NEXT: [[V_ADDR:%.*]] = alloca i32*, align 4 +// CHECK6-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[I3:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [5 x i8*], align 4 +// CHECK6-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK6-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK6-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK6-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4 +// CHECK6-NEXT: store i32* [[V]], i32** [[V_ADDR]], align 4 +// CHECK6-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4 +// CHECK6-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK6-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK6-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK6-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 +// CHECK6-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK6-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK6-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK6-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK6-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK6-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] +// CHECK6-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK6: omp.precond.then: +// CHECK6-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK6-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK6-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK6-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK6-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK6-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK6-NEXT: [[TMP5:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK6-NEXT: [[TMP6:%.*]] = load i32, 
i32* [[TMP5]], align 4 +// CHECK6-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP6]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) +// CHECK6-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK6-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK6-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP7]], [[TMP8]] +// CHECK6-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK6: cond.true: +// CHECK6-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK6-NEXT: br label [[COND_END:%.*]] +// CHECK6: cond.false: +// CHECK6-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK6-NEXT: br label [[COND_END]] +// CHECK6: cond.end: +// CHECK6-NEXT: [[COND:%.*]] = phi i32 [ [[TMP9]], [[COND_TRUE]] ], [ [[TMP10]], [[COND_FALSE]] ] +// CHECK6-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK6-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK6-NEXT: store i32 [[TMP11]], i32* [[DOTOMP_IV]], align 4 +// CHECK6-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK6: omp.inner.for.cond: +// CHECK6-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK6-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK6-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP13]], 1 +// CHECK6-NEXT: [[CMP5:%.*]] = icmp slt i32 [[TMP12]], [[ADD]] +// CHECK6-NEXT: br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK6: omp.inner.for.body: +// CHECK6-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK6-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK6-NEXT: [[TMP16:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK6-NEXT: store i32 [[TMP16]], i32* [[N_CASTED]], align 4 +// CHECK6-NEXT: [[TMP17:%.*]] = load i32, i32* [[N_CASTED]], align 4 +// CHECK6-NEXT: [[TMP18:%.*]] = load i32*, i32** [[V_ADDR]], align 4 +// CHECK6-NEXT: [[TMP19:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 +// CHECK6-NEXT: [[TMP20:%.*]] = inttoptr i32 [[TMP14]] to i8* +// CHECK6-NEXT: store i8* [[TMP20]], i8** [[TMP19]], align 4 +// CHECK6-NEXT: [[TMP21:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 +// CHECK6-NEXT: [[TMP22:%.*]] = inttoptr i32 [[TMP15]] to i8* +// CHECK6-NEXT: store i8* [[TMP22]], i8** [[TMP21]], align 4 +// CHECK6-NEXT: [[TMP23:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 +// CHECK6-NEXT: [[TMP24:%.*]] = inttoptr i32 [[TMP17]] to i8* +// CHECK6-NEXT: store i8* [[TMP24]], i8** [[TMP23]], align 4 +// CHECK6-NEXT: [[TMP25:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3 +// CHECK6-NEXT: [[TMP26:%.*]] = bitcast [1000 x i32]* [[TMP0]] to i8* +// CHECK6-NEXT: store i8* [[TMP26]], i8** [[TMP25]], align 4 +// CHECK6-NEXT: [[TMP27:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 4 +// CHECK6-NEXT: [[TMP28:%.*]] = bitcast i32* [[TMP18]] to i8* +// CHECK6-NEXT: store i8* [[TMP28]], i8** [[TMP27]], align 4 +// CHECK6-NEXT: [[TMP29:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK6-NEXT: [[TMP30:%.*]] = load i32, i32* [[TMP29]], align 4 +// CHECK6-NEXT: [[TMP31:%.*]] = bitcast [5 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK6-NEXT: 
call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP30]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, i32, [1000 x i32]*, i32*)* @__omp_outlined__11 to i8*), i8* null, i8** [[TMP31]], i32 5) +// CHECK6-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK6: omp.inner.for.inc: +// CHECK6-NEXT: [[TMP32:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK6-NEXT: [[TMP33:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK6-NEXT: [[ADD6:%.*]] = add nsw i32 [[TMP32]], [[TMP33]] +// CHECK6-NEXT: store i32 [[ADD6]], i32* [[DOTOMP_IV]], align 4 +// CHECK6-NEXT: [[TMP34:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK6-NEXT: [[TMP35:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK6-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP34]], [[TMP35]] +// CHECK6-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK6-NEXT: [[TMP36:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK6-NEXT: [[TMP37:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK6-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP36]], [[TMP37]] +// CHECK6-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK6-NEXT: [[TMP38:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK6-NEXT: [[TMP39:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK6-NEXT: [[CMP9:%.*]] = icmp sgt i32 [[TMP38]], [[TMP39]] +// CHECK6-NEXT: br i1 [[CMP9]], label [[COND_TRUE10:%.*]], label [[COND_FALSE11:%.*]] +// CHECK6: cond.true10: +// CHECK6-NEXT: [[TMP40:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK6-NEXT: br label [[COND_END12:%.*]] +// CHECK6: cond.false11: +// CHECK6-NEXT: [[TMP41:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK6-NEXT: br label [[COND_END12]] +// CHECK6: cond.end12: +// CHECK6-NEXT: [[COND13:%.*]] = phi i32 [ [[TMP40]], [[COND_TRUE10]] ], [ [[TMP41]], [[COND_FALSE11]] ] +// CHECK6-NEXT: store i32 [[COND13]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK6-NEXT: [[TMP42:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK6-NEXT: store i32 [[TMP42]], i32* [[DOTOMP_IV]], align 4 +// CHECK6-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK6: omp.inner.for.end: +// CHECK6-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK6: omp.loop.exit: +// CHECK6-NEXT: [[TMP43:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK6-NEXT: [[TMP44:%.*]] = load i32, i32* [[TMP43]], align 4 +// CHECK6-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP44]]) +// CHECK6-NEXT: br label [[OMP_PRECOND_END]] +// CHECK6: omp.precond.end: +// CHECK6-NEXT: ret void +// +// +// CHECK6-LABEL: define {{[^@]+}}@__omp_outlined__11 +// CHECK6-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32* [[V:%.*]]) #[[ATTR0]] { +// CHECK6-NEXT: entry: +// CHECK6-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK6-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK6-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4 +// CHECK6-NEXT: [[V_ADDR:%.*]] = alloca i32*, align 4 +// CHECK6-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, 
align 4 +// CHECK6-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[I3:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK6-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK6-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK6-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK6-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK6-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4 +// CHECK6-NEXT: store i32* [[V]], i32** [[V_ADDR]], align 4 +// CHECK6-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4 +// CHECK6-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK6-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK6-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK6-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 +// CHECK6-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK6-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK6-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK6-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK6-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK6-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] +// CHECK6-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK6: omp.precond.then: +// CHECK6-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 +// CHECK6-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK6-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_UB]], align 4 +// CHECK6-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK6-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK6-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_LB]], align 4 +// CHECK6-NEXT: store i32 [[TMP6]], i32* [[DOTOMP_UB]], align 4 +// CHECK6-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK6-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK6-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK6-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4 +// CHECK6-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP8]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) +// CHECK6-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK6-NEXT: store i32 [[TMP9]], i32* [[DOTOMP_IV]], align 4 +// CHECK6-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK6: omp.inner.for.cond: +// CHECK6-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK6-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK6-NEXT: [[CMP4:%.*]] = icmp ule i32 [[TMP10]], [[TMP11]] +// CHECK6-NEXT: br i1 [[CMP4]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK6: omp.inner.for.body: +// CHECK6-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK6-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP12]], 1 +// CHECK6-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK6-NEXT: store i32 
[[ADD]], i32* [[I3]], align 4 +// CHECK6-NEXT: [[TMP13:%.*]] = load i32*, i32** [[V_ADDR]], align 4 +// CHECK6-NEXT: [[TMP14:%.*]] = load i32, i32* [[I3]], align 4 +// CHECK6-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TMP13]], i32 [[TMP14]] +// CHECK6-NEXT: [[TMP15:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +// CHECK6-NEXT: [[TMP16:%.*]] = load i32, i32* [[I3]], align 4 +// CHECK6-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds [1000 x i32], [1000 x i32]* [[TMP0]], i32 0, i32 [[TMP16]] +// CHECK6-NEXT: store i32 [[TMP15]], i32* [[ARRAYIDX5]], align 4 +// CHECK6-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK6: omp.body.continue: +// CHECK6-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK6: omp.inner.for.inc: +// CHECK6-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK6-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK6-NEXT: [[ADD6:%.*]] = add nsw i32 [[TMP17]], [[TMP18]] +// CHECK6-NEXT: store i32 [[ADD6]], i32* [[DOTOMP_IV]], align 4 +// CHECK6-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK6: omp.inner.for.end: +// CHECK6-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK6: omp.loop.exit: +// CHECK6-NEXT: [[TMP19:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK6-NEXT: [[TMP20:%.*]] = load i32, i32* [[TMP19]], align 4 +// CHECK6-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP20]]) +// CHECK6-NEXT: br label [[OMP_PRECOND_END]] +// CHECK6: omp.precond.end: +// CHECK6-NEXT: ret void +// +// +// CHECK7-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l43 +// CHECK7-SAME: (i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32 [[L:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK7-NEXT: entry: +// CHECK7-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4 +// CHECK7-NEXT: [[L_ADDR:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: [[L_CASTED:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK7-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK7-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4 +// CHECK7-NEXT: store i32 [[L]], i32* [[L_ADDR]], align 4 +// CHECK7-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4 +// CHECK7-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK7-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) +// CHECK7-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK7: .execute: +// CHECK7-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3:[0-9]+]]) +// CHECK7-NEXT: [[TMP2:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK7-NEXT: store i32 [[TMP2]], i32* [[N_CASTED]], align 4 +// CHECK7-NEXT: [[TMP3:%.*]] = load i32, i32* [[N_CASTED]], align 4 +// CHECK7-NEXT: [[TMP4:%.*]] = load i32, i32* [[L_ADDR]], align 4 +// CHECK7-NEXT: store i32 [[TMP4]], i32* [[L_CASTED]], align 4 +// CHECK7-NEXT: [[TMP5:%.*]] = load i32, i32* [[L_CASTED]], align 4 +// CHECK7-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK7-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32 [[TMP3]], [1000 x i32]* [[TMP0]], i32 [[TMP5]]) #[[ATTR2:[0-9]+]] +// CHECK7-NEXT: br label 
[[DOTOMP_DEINIT:%.*]]
+// CHECK7: .omp.deinit:
+// CHECK7-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0)
+// CHECK7-NEXT: br label [[DOTEXIT:%.*]]
+// CHECK7: .exit:
+// CHECK7-NEXT: ret void
+//
+//
+// CHECK7-LABEL: define {{[^@]+}}@__omp_outlined__
+// CHECK7-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32 [[L:%.*]]) #[[ATTR0]] {
+// CHECK7-NEXT: entry:
+// CHECK7-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK7-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK7-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4
+// CHECK7-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4
+// CHECK7-NEXT: [[L_ADDR:%.*]] = alloca i32, align 4
+// CHECK7-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK7-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK7-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK7-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4
+// CHECK7-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK7-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK7-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK7-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK7-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK7-NEXT: [[I4:%.*]] = alloca i32, align 4
+// CHECK7-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4
+// CHECK7-NEXT: [[L_CASTED:%.*]] = alloca i32, align 4
+// CHECK7-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [5 x i8*], align 4
+// CHECK7-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK7-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK7-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4
+// CHECK7-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4
+// CHECK7-NEXT: store i32 [[L]], i32* [[L_ADDR]], align 4
+// CHECK7-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4
+// CHECK7-NEXT: [[TMP1:%.*]] = call i8* @__kmpc_data_sharing_push_stack(i32 4, i16 1)
+// CHECK7-NEXT: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to %struct._globalized_locals_ty*
+// CHECK7-NEXT: [[L1:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP2]], i32 0, i32 0
+// CHECK7-NEXT: [[TMP3:%.*]] = load i32, i32* [[N_ADDR]], align 4
+// CHECK7-NEXT: store i32 [[TMP3]], i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK7-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK7-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP4]], 0
+// CHECK7-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK7-NEXT: [[SUB3:%.*]] = sub nsw i32 [[DIV]], 1
+// CHECK7-NEXT: store i32 [[SUB3]], i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK7-NEXT: store i32 0, i32* [[I]], align 4
+// CHECK7-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK7-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP5]]
+// CHECK7-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]]
+// CHECK7: omp.precond.then:
+// CHECK7-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK7-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK7-NEXT: store i32 [[TMP6]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK7-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK7-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK7-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK7-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4
+// CHECK7-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1:[0-9]+]], i32 [[TMP8]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 128)
+// CHECK7-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK7-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK7-NEXT: [[CMP5:%.*]] = icmp sgt i32 [[TMP9]], [[TMP10]]
+// CHECK7-NEXT: br i1 [[CMP5]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK7: cond.true:
+// CHECK7-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK7-NEXT: br label [[COND_END:%.*]]
+// CHECK7: cond.false:
+// CHECK7-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK7-NEXT: br label [[COND_END]]
+// CHECK7: cond.end:
+// CHECK7-NEXT: [[COND:%.*]] = phi i32 [ [[TMP11]], [[COND_TRUE]] ], [ [[TMP12]], [[COND_FALSE]] ]
+// CHECK7-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK7-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK7-NEXT: store i32 [[TMP13]], i32* [[DOTOMP_IV]], align 4
+// CHECK7-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK7: omp.inner.for.cond:
+// CHECK7-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK7-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK7-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP15]], 1
+// CHECK7-NEXT: [[CMP6:%.*]] = icmp slt i32 [[TMP14]], [[ADD]]
+// CHECK7-NEXT: br i1 [[CMP6]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK7: omp.inner.for.body:
+// CHECK7-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK7-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK7-NEXT: [[TMP18:%.*]] = load i32, i32* [[N_ADDR]], align 4
+// CHECK7-NEXT: store i32 [[TMP18]], i32* [[N_CASTED]], align 4
+// CHECK7-NEXT: [[TMP19:%.*]] = load i32, i32* [[N_CASTED]], align 4
+// CHECK7-NEXT: [[TMP20:%.*]] = load i32, i32* [[L_ADDR]], align 4
+// CHECK7-NEXT: store i32 [[TMP20]], i32* [[L_CASTED]], align 4
+// CHECK7-NEXT: [[TMP21:%.*]] = load i32, i32* [[L_CASTED]], align 4
+// CHECK7-NEXT: [[TMP22:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0
+// CHECK7-NEXT: [[TMP23:%.*]] = inttoptr i32 [[TMP16]] to i8*
+// CHECK7-NEXT: store i8* [[TMP23]], i8** [[TMP22]], align 4
+// CHECK7-NEXT: [[TMP24:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1
+// CHECK7-NEXT: [[TMP25:%.*]] = inttoptr i32 [[TMP17]] to i8*
+// CHECK7-NEXT: store i8* [[TMP25]], i8** [[TMP24]], align 4
+// CHECK7-NEXT: [[TMP26:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2
+// CHECK7-NEXT: [[TMP27:%.*]] = inttoptr i32 [[TMP19]] to i8*
+// CHECK7-NEXT: store i8* [[TMP27]], i8** [[TMP26]], align 4
+// CHECK7-NEXT: [[TMP28:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3
+// CHECK7-NEXT: [[TMP29:%.*]] = bitcast [1000 x i32]* [[TMP0]] to i8*
+// CHECK7-NEXT: store i8* [[TMP29]], i8** [[TMP28]], align 4
+// CHECK7-NEXT: [[TMP30:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 4
+// CHECK7-NEXT: [[TMP31:%.*]] = inttoptr i32 [[TMP21]] to i8*
+// CHECK7-NEXT: store i8* [[TMP31]], i8** [[TMP30]], align 4
+// CHECK7-NEXT: [[TMP32:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK7-NEXT: [[TMP33:%.*]] = load i32, i32* [[TMP32]], align 4
+// CHECK7-NEXT: [[TMP34:%.*]] = bitcast [5 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK7-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP33]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, i32, [1000 x i32]*, i32)* @__omp_outlined__1 to i8*), i8* null, i8** [[TMP34]], i32 5)
+// CHECK7-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK7: omp.inner.for.inc:
+// CHECK7-NEXT: [[TMP35:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK7-NEXT: [[TMP36:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK7-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP35]], [[TMP36]]
+// CHECK7-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_IV]], align 4
+// CHECK7-NEXT: [[TMP37:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK7-NEXT: [[TMP38:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK7-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP37]], [[TMP38]]
+// CHECK7-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK7-NEXT: [[TMP39:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK7-NEXT: [[TMP40:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK7-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP39]], [[TMP40]]
+// CHECK7-NEXT: store i32 [[ADD9]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK7-NEXT: [[TMP41:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK7-NEXT: [[TMP42:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK7-NEXT: [[CMP10:%.*]] = icmp sgt i32 [[TMP41]], [[TMP42]]
+// CHECK7-NEXT: br i1 [[CMP10]], label [[COND_TRUE11:%.*]], label [[COND_FALSE12:%.*]]
+// CHECK7: cond.true11:
+// CHECK7-NEXT: [[TMP43:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK7-NEXT: br label [[COND_END13:%.*]]
+// CHECK7: cond.false12:
+// CHECK7-NEXT: [[TMP44:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK7-NEXT: br label [[COND_END13]]
+// CHECK7: cond.end13:
+// CHECK7-NEXT: [[COND14:%.*]] = phi i32 [ [[TMP43]], [[COND_TRUE11]] ], [ [[TMP44]], [[COND_FALSE12]] ]
+// CHECK7-NEXT: store i32 [[COND14]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK7-NEXT: [[TMP45:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK7-NEXT: store i32 [[TMP45]], i32* [[DOTOMP_IV]], align 4
+// CHECK7-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK7: omp.inner.for.end:
+// CHECK7-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK7: omp.loop.exit:
+// CHECK7-NEXT: [[TMP46:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK7-NEXT: [[TMP47:%.*]] = load i32, i32* [[TMP46]], align 4
+// CHECK7-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP47]])
+// CHECK7-NEXT: [[TMP48:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK7-NEXT: [[TMP49:%.*]] = icmp ne i32 [[TMP48]], 0
+// CHECK7-NEXT: br i1 [[TMP49]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]]
+// CHECK7: .omp.lastprivate.then:
+// CHECK7-NEXT: [[TMP50:%.*]] = load i32, i32* [[L_ADDR]], align 4
+// CHECK7-NEXT: store i32 [[TMP50]], i32* [[L_ADDR]], align 4
+// CHECK7-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]]
+// CHECK7: .omp.lastprivate.done:
+// CHECK7-NEXT: br label [[OMP_PRECOND_END]]
+// CHECK7: omp.precond.end:
+// CHECK7-NEXT: call void @__kmpc_data_sharing_pop_stack(i8* [[TMP1]])
+// CHECK7-NEXT: ret void
+//
+//
+// CHECK7-LABEL: define {{[^@]+}}@__omp_outlined__1
+// CHECK7-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32 [[L:%.*]]) #[[ATTR0]] {
+// CHECK7-NEXT: entry:
+// CHECK7-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK7-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK7-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4
+// CHECK7-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4
+// CHECK7-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4
+// CHECK7-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4
+// CHECK7-NEXT: [[L_ADDR:%.*]] = alloca i32, align 4
+// CHECK7-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK7-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK7-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK7-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
+// CHECK7-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK7-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK7-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK7-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK7-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK7-NEXT: [[I3:%.*]] = alloca i32, align 4
+// CHECK7-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK7-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK7-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK7-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK7-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4
+// CHECK7-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4
+// CHECK7-NEXT: store i32 [[L]], i32* [[L_ADDR]], align 4
+// CHECK7-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4
+// CHECK7-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4
+// CHECK7-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK7-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK7-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0
+// CHECK7-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK7-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1
+// CHECK7-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK7-NEXT: store i32 0, i32* [[I]], align 4
+// CHECK7-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK7-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]]
+// CHECK7-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]]
+// CHECK7: omp.precond.then:
+// CHECK7-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK7-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK7-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_UB]], align 4
+// CHECK7-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK7-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK7-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_LB]], align 4
+// CHECK7-NEXT: store i32 [[TMP6]], i32* [[DOTOMP_UB]], align 4
+// CHECK7-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK7-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK7-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK7-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4
+// CHECK7-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP8]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 32)
+// CHECK7-NEXT: br label [[OMP_DISPATCH_COND:%.*]]
+// CHECK7: omp.dispatch.cond:
+// CHECK7-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK7-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK7-NEXT: [[CMP4:%.*]] = icmp ugt i32 [[TMP9]], [[TMP10]]
+// CHECK7-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK7: cond.true:
+// CHECK7-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK7-NEXT: br label [[COND_END:%.*]]
+// CHECK7: cond.false:
+// CHECK7-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK7-NEXT: br label [[COND_END]]
+// CHECK7: cond.end:
+// CHECK7-NEXT: [[COND:%.*]] = phi i32 [ [[TMP11]], [[COND_TRUE]] ], [ [[TMP12]], [[COND_FALSE]] ]
+// CHECK7-NEXT: store i32 [[COND]], i32* [[DOTOMP_UB]], align 4
+// CHECK7-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK7-NEXT: store i32 [[TMP13]], i32* [[DOTOMP_IV]], align 4
+// CHECK7-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK7-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK7-NEXT: [[CMP5:%.*]] = icmp sle i32 [[TMP14]], [[TMP15]]
+// CHECK7-NEXT: br i1 [[CMP5]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]]
+// CHECK7: omp.dispatch.body:
+// CHECK7-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK7: omp.inner.for.cond:
+// CHECK7-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK7-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK7-NEXT: [[CMP6:%.*]] = icmp sle i32 [[TMP16]], [[TMP17]]
+// CHECK7-NEXT: br i1 [[CMP6]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK7: omp.inner.for.body:
+// CHECK7-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK7-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP18]], 1
+// CHECK7-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK7-NEXT: store i32 [[ADD]], i32* [[I3]], align 4
+// CHECK7-NEXT: [[TMP19:%.*]] = load i32, i32* [[I3]], align 4
+// CHECK7-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1000 x i32], [1000 x i32]* [[TMP0]], i32 0, i32 [[TMP19]]
+// CHECK7-NEXT: store i32 1, i32* [[ARRAYIDX]], align 4
+// CHECK7-NEXT: [[TMP20:%.*]] = load i32, i32* [[I3]], align 4
+// CHECK7-NEXT: store i32 [[TMP20]], i32* [[L_ADDR]], align 4
+// CHECK7-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK7: omp.body.continue:
+// CHECK7-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK7: omp.inner.for.inc:
+// CHECK7-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK7-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP21]], 1
+// CHECK7-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_IV]], align 4
+// CHECK7-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK7: omp.inner.for.end:
+// CHECK7-NEXT: br label [[OMP_DISPATCH_INC:%.*]]
+// CHECK7: omp.dispatch.inc:
+// CHECK7-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK7-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK7-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP22]], [[TMP23]]
+// CHECK7-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_LB]], align 4
+// CHECK7-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK7-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK7-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP24]], [[TMP25]]
+// CHECK7-NEXT: store i32 [[ADD9]], i32* [[DOTOMP_UB]], align 4
+// CHECK7-NEXT: br label [[OMP_DISPATCH_COND]]
+// CHECK7: omp.dispatch.end:
+// CHECK7-NEXT: [[TMP26:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK7-NEXT: [[TMP27:%.*]] = load i32, i32* [[TMP26]], align 4
+// CHECK7-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP27]])
+// CHECK7-NEXT: [[TMP28:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK7-NEXT: [[TMP29:%.*]] = icmp ne i32 [[TMP28]], 0
+// CHECK7-NEXT: br i1 [[TMP29]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]]
+// CHECK7: .omp.lastprivate.then:
+// CHECK7-NEXT: [[TMP30:%.*]] = load i32, i32* [[L_ADDR]], align 4
+// CHECK7-NEXT: store i32 [[TMP30]], i32* [[L_ADDR]], align 4
+// CHECK7-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]]
+// CHECK7: .omp.lastprivate.done:
+// CHECK7-NEXT: br label [[OMP_PRECOND_END]]
+// CHECK7: omp.precond.end:
+// CHECK7-NEXT: ret void
+//
+//
+// CHECK7-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l49
+// CHECK7-SAME: (i32 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] {
+// CHECK7-NEXT: entry:
+// CHECK7-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4
+// CHECK7-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 4
+// CHECK7-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4
+// CHECK7-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK7-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK7-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK7-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4
+// CHECK7-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 4
+// CHECK7-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 4
+// CHECK7-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK7-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0)
+// CHECK7-NEXT: br label [[DOTEXECUTE:%.*]]
+// CHECK7: .execute:
+// CHECK7-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]])
+// CHECK7-NEXT: [[TMP2:%.*]] = load i32, i32* [[N_ADDR]], align 4
+// CHECK7-NEXT: store i32 [[TMP2]], i32* [[N_CASTED]], align 4
+// CHECK7-NEXT: [[TMP3:%.*]] = load i32, i32* [[N_CASTED]], align 4
+// CHECK7-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK7-NEXT: call void @__omp_outlined__2(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32 [[TMP3]], [1000 x i16]* [[TMP0]]) #[[ATTR2]]
+// CHECK7-NEXT: br label [[DOTOMP_DEINIT:%.*]]
+// CHECK7: .omp.deinit:
+// CHECK7-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0)
+// CHECK7-NEXT: br label [[DOTEXIT:%.*]]
+// CHECK7: .exit:
+// CHECK7-NEXT: ret void
+//
+//
+// CHECK7-LABEL: define {{[^@]+}}@__omp_outlined__2
+// CHECK7-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] {
+// CHECK7-NEXT: entry:
+// CHECK7-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK7-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK7-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4
+// CHECK7-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 4
+// CHECK7-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK7-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK7-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK7-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
+// CHECK7-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK7-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK7-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK7-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK7-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK7-NEXT: [[I3:%.*]] = alloca i32, align 4
+// CHECK7-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4
+// CHECK7-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x i8*], align 4
+// CHECK7-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK7-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK7-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4
+// CHECK7-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 4
+// CHECK7-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 4
+// CHECK7-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4
+// CHECK7-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK7-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK7-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0
+// CHECK7-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK7-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1
+// CHECK7-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK7-NEXT: store i32 0, i32* [[I]], align 4
+// CHECK7-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK7-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]]
+// CHECK7-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]]
+// CHECK7: omp.precond.then:
+// CHECK7-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK7-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK7-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK7-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK7-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK7-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK7-NEXT: [[TMP5:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK7-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4
+// CHECK7-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP6]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]])
+// CHECK7-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK7-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK7-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP7]], [[TMP8]]
+// CHECK7-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK7: cond.true:
+// CHECK7-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK7-NEXT: br label [[COND_END:%.*]]
+// CHECK7: cond.false:
+// CHECK7-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK7-NEXT: br label [[COND_END]]
+// CHECK7: cond.end:
+// CHECK7-NEXT: [[COND:%.*]] = phi i32 [ [[TMP9]], [[COND_TRUE]] ], [ [[TMP10]], [[COND_FALSE]] ]
+// CHECK7-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK7-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK7-NEXT: store i32 [[TMP11]], i32* [[DOTOMP_IV]], align 4
+// CHECK7-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK7: omp.inner.for.cond:
+// CHECK7-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK7-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK7-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP13]], 1
+// CHECK7-NEXT: [[CMP5:%.*]] = icmp slt i32 [[TMP12]], [[ADD]]
+// CHECK7-NEXT: br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK7: omp.inner.for.body:
+// CHECK7-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK7-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK7-NEXT: [[TMP16:%.*]] = load i32, i32* [[N_ADDR]], align 4
+// CHECK7-NEXT: store i32 [[TMP16]], i32* [[N_CASTED]], align 4
+// CHECK7-NEXT: [[TMP17:%.*]] = load i32, i32* [[N_CASTED]], align 4
+// CHECK7-NEXT: [[TMP18:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0
+// CHECK7-NEXT: [[TMP19:%.*]] = inttoptr i32 [[TMP14]] to i8*
+// CHECK7-NEXT: store i8* [[TMP19]], i8** [[TMP18]], align 4
+// CHECK7-NEXT: [[TMP20:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1
+// CHECK7-NEXT: [[TMP21:%.*]] = inttoptr i32 [[TMP15]] to i8*
+// CHECK7-NEXT: store i8* [[TMP21]], i8** [[TMP20]], align 4
+// CHECK7-NEXT: [[TMP22:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2
+// CHECK7-NEXT: [[TMP23:%.*]] = inttoptr i32 [[TMP17]] to i8*
+// CHECK7-NEXT: store i8* [[TMP23]], i8** [[TMP22]], align 4
+// CHECK7-NEXT: [[TMP24:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3
+// CHECK7-NEXT: [[TMP25:%.*]] = bitcast [1000 x i16]* [[TMP0]] to i8*
+// CHECK7-NEXT: store i8* [[TMP25]], i8** [[TMP24]], align 4
+// CHECK7-NEXT: [[TMP26:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK7-NEXT: [[TMP27:%.*]] = load i32, i32* [[TMP26]], align 4
+// CHECK7-NEXT: [[TMP28:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK7-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP27]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, i32, [1000 x i16]*)* @__omp_outlined__3 to i8*), i8* null, i8** [[TMP28]], i32 4)
+// CHECK7-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK7: omp.inner.for.inc:
+// CHECK7-NEXT: [[TMP29:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK7-NEXT: [[TMP30:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK7-NEXT: [[ADD6:%.*]] = add nsw i32 [[TMP29]], [[TMP30]]
+// CHECK7-NEXT: store i32 [[ADD6]], i32* [[DOTOMP_IV]], align 4
+// CHECK7-NEXT: [[TMP31:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK7-NEXT: [[TMP32:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK7-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP31]], [[TMP32]]
+// CHECK7-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK7-NEXT: [[TMP33:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK7-NEXT: [[TMP34:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK7-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP33]], [[TMP34]]
+// CHECK7-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK7-NEXT: [[TMP35:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK7-NEXT: [[TMP36:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK7-NEXT: [[CMP9:%.*]] = icmp sgt i32 [[TMP35]], [[TMP36]]
+// CHECK7-NEXT: br i1 [[CMP9]], label [[COND_TRUE10:%.*]], label [[COND_FALSE11:%.*]]
+// CHECK7: cond.true10:
+// CHECK7-NEXT: [[TMP37:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK7-NEXT: br label [[COND_END12:%.*]]
+// CHECK7: cond.false11:
+// CHECK7-NEXT: [[TMP38:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK7-NEXT: br label [[COND_END12]]
+// CHECK7: cond.end12:
+// CHECK7-NEXT: [[COND13:%.*]] = phi i32 [ [[TMP37]], [[COND_TRUE10]] ], [ [[TMP38]], [[COND_FALSE11]] ]
+// CHECK7-NEXT: store i32 [[COND13]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK7-NEXT: [[TMP39:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK7-NEXT: store i32 [[TMP39]], i32* [[DOTOMP_IV]], align 4
+// CHECK7-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK7: omp.inner.for.end:
+// CHECK7-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK7: omp.loop.exit:
+// CHECK7-NEXT: [[TMP40:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK7-NEXT: [[TMP41:%.*]] = load i32, i32* [[TMP40]], align 4
+// CHECK7-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP41]])
+// CHECK7-NEXT: br label [[OMP_PRECOND_END]]
+// CHECK7: omp.precond.end:
+// CHECK7-NEXT: ret void
+//
+//
+// CHECK7-LABEL: define {{[^@]+}}@__omp_outlined__3
+// CHECK7-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] {
+// CHECK7-NEXT: entry:
+// CHECK7-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK7-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK7-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4
+// CHECK7-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4
+// CHECK7-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4
+// CHECK7-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 4
+// CHECK7-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK7-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK7-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK7-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
+// CHECK7-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK7-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK7-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK7-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK7-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK7-NEXT: [[I3:%.*]] = alloca i32, align 4
+// CHECK7-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK7-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK7-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK7-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK7-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4
+// CHECK7-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 4
+// CHECK7-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 4
+// CHECK7-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4
+// CHECK7-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK7-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK7-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0
+// CHECK7-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK7-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1
+// CHECK7-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK7-NEXT: store i32 0, i32* [[I]], align 4
+// CHECK7-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK7-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]]
+// CHECK7-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]]
+// CHECK7: omp.precond.then:
+// CHECK7-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK7-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK7-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_UB]], align 4
+// CHECK7-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK7-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK7-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_LB]], align 4
+// CHECK7-NEXT: store i32 [[TMP6]], i32* [[DOTOMP_UB]], align 4
+// CHECK7-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK7-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK7-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK7-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4
+// CHECK7-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP8]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK7-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK7-NEXT: store i32 [[TMP9]], i32* [[DOTOMP_IV]], align 4
+// CHECK7-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK7: omp.inner.for.cond:
+// CHECK7-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK7-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK7-NEXT: [[CMP4:%.*]] = icmp ule i32 [[TMP10]], [[TMP11]]
+// CHECK7-NEXT: br i1 [[CMP4]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK7: omp.inner.for.body:
+// CHECK7-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK7-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP12]], 1
+// CHECK7-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK7-NEXT: store i32 [[ADD]], i32* [[I3]], align 4
+// CHECK7-NEXT: [[TMP13:%.*]] = load i32, i32* [[I3]], align 4
+// CHECK7-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1000 x i16], [1000 x i16]* [[TMP0]], i32 0, i32 [[TMP13]]
+// CHECK7-NEXT: [[TMP14:%.*]] = load i16, i16* [[ARRAYIDX]], align 2
+// CHECK7-NEXT: [[CONV:%.*]] = sext i16 [[TMP14]] to i32
+// CHECK7-NEXT: [[ADD5:%.*]] = add nsw i32 [[CONV]], 1
+// CHECK7-NEXT: [[CONV6:%.*]] = trunc i32 [[ADD5]] to i16
+// CHECK7-NEXT: store i16 [[CONV6]], i16* [[ARRAYIDX]], align 2
+// CHECK7-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK7: omp.body.continue:
+// CHECK7-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK7: omp.inner.for.inc:
+// CHECK7-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK7-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK7-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP15]], [[TMP16]]
+// CHECK7-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_IV]], align 4
+// CHECK7-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK7: omp.inner.for.end:
+// CHECK7-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK7: omp.loop.exit:
+// CHECK7-NEXT: [[TMP17:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK7-NEXT: [[TMP18:%.*]] = load i32, i32* [[TMP17]], align 4
+// CHECK7-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP18]])
+// CHECK7-NEXT: br label [[OMP_PRECOND_END]]
+// CHECK7: omp.precond.end:
+// CHECK7-NEXT: ret void
+//
+//
+// CHECK7-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l54
+// CHECK7-SAME: ([10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] {
+// CHECK7-NEXT: entry:
+// CHECK7-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4
+// CHECK7-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK7-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK7-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK7-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4
+// CHECK7-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4
+// CHECK7-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK7-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0)
+// CHECK7-NEXT: br label [[DOTEXECUTE:%.*]]
+// CHECK7: .execute:
+// CHECK7-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]])
+// CHECK7-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK7-NEXT: call void @__omp_outlined__4(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x i32]* [[TMP0]]) #[[ATTR2]]
+// CHECK7-NEXT: br label [[DOTOMP_DEINIT:%.*]]
+// CHECK7: .omp.deinit:
+// CHECK7-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0)
+// CHECK7-NEXT: br label [[DOTEXIT:%.*]]
+// CHECK7: .exit:
+// CHECK7-NEXT: ret void
+//
+//
+// CHECK7-LABEL: define {{[^@]+}}@__omp_outlined__4
+// CHECK7-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] {
+// CHECK7-NEXT: entry:
+// CHECK7-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK7-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK7-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4
+// CHECK7-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK7-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK7-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK7-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK7-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK7-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK7-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK7-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [3 x i8*], align 4
+// CHECK7-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK7-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK7-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4
+// CHECK7-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4
+// CHECK7-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK7-NEXT: store i32 9, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK7-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK7-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK7-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK7-NEXT: [[TMP1:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK7-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4
+// CHECK7-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]])
+// CHECK7-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK7-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP3]], 9
+// CHECK7-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK7: cond.true:
+// CHECK7-NEXT: br label [[COND_END:%.*]]
+// CHECK7: cond.false:
+// CHECK7-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK7-NEXT: br label [[COND_END]]
+// CHECK7: cond.end:
+// CHECK7-NEXT: [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP4]], [[COND_FALSE]] ]
+// CHECK7-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK7-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK7-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4
+// CHECK7-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK7: omp.inner.for.cond:
+// CHECK7-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK7-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP6]], 10
+// CHECK7-NEXT: br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK7: omp.inner.for.body:
+// CHECK7-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK7-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK7-NEXT: [[TMP9:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0
+// CHECK7-NEXT: [[TMP10:%.*]] = inttoptr i32 [[TMP7]] to i8*
+// CHECK7-NEXT: store i8* [[TMP10]], i8** [[TMP9]], align 4
+// CHECK7-NEXT: [[TMP11:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1
+// CHECK7-NEXT: [[TMP12:%.*]] = inttoptr i32 [[TMP8]] to i8*
+// CHECK7-NEXT: store i8* [[TMP12]], i8** [[TMP11]], align 4
+// CHECK7-NEXT: [[TMP13:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2
+// CHECK7-NEXT: [[TMP14:%.*]] = bitcast [10 x i32]* [[TMP0]] to i8*
+// CHECK7-NEXT: store i8* [[TMP14]], i8** [[TMP13]], align 4
+// CHECK7-NEXT: [[TMP15:%.*]] = bitcast [3 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK7-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, [10 x i32]*)* @__omp_outlined__5 to i8*), i8* null, i8** [[TMP15]], i32 3)
+// CHECK7-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK7: omp.inner.for.inc:
+// CHECK7-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK7-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK7-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP16]], [[TMP17]]
+// CHECK7-NEXT: store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4
+// CHECK7-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK7-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK7-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP18]], [[TMP19]]
+// CHECK7-NEXT: store i32 [[ADD2]], i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK7-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK7-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK7-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP20]], [[TMP21]]
+// CHECK7-NEXT: store i32 [[ADD3]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK7-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK7-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP22]], 9
+// CHECK7-NEXT: br i1 [[CMP4]], label [[COND_TRUE5:%.*]], label [[COND_FALSE6:%.*]]
+// CHECK7: cond.true5:
+// CHECK7-NEXT: br label [[COND_END7:%.*]]
+// CHECK7: cond.false6:
+// CHECK7-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK7-NEXT: br label [[COND_END7]]
+// CHECK7: cond.end7:
+// CHECK7-NEXT: [[COND8:%.*]] = phi i32 [ 9, [[COND_TRUE5]] ], [ [[TMP23]], [[COND_FALSE6]] ]
+// CHECK7-NEXT: store i32 [[COND8]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK7-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK7-NEXT: store i32 [[TMP24]], i32* [[DOTOMP_IV]], align 4
+// CHECK7-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK7: omp.inner.for.end:
+// CHECK7-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK7: omp.loop.exit:
+// CHECK7-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]])
+// CHECK7-NEXT: ret void
+//
+//
+// CHECK7-LABEL: define {{[^@]+}}@__omp_outlined__5
+// CHECK7-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] {
+// CHECK7-NEXT: entry:
+// CHECK7-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK7-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK7-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4
+// CHECK7-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4
+// CHECK7-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4
+// CHECK7-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK7-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK7-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK7-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK7-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK7-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK7-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK7-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK7-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK7-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK7-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK7-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4
+// CHECK7-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4
+// CHECK7-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK7-NEXT: store i32 9, i32* [[DOTOMP_UB]], align 4
+// CHECK7-NEXT: [[TMP1:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK7-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK7-NEXT: store i32 [[TMP1]], i32* [[DOTOMP_LB]], align 4
+// CHECK7-NEXT: store i32 [[TMP2]], i32* [[DOTOMP_UB]], align 4
+// CHECK7-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK7-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK7-NEXT: [[TMP3:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK7-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4
+// CHECK7-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP4]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK7-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK7-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4
+// CHECK7-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK7: omp.inner.for.cond:
+// CHECK7-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK7-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK7-NEXT: [[CMP:%.*]] = icmp ule i32 [[TMP6]], [[TMP7]]
+// CHECK7-NEXT: br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK7: omp.inner.for.body:
+// CHECK7-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK7-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP8]], 1
+// CHECK7-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK7-NEXT: store i32 [[ADD]], i32* [[I]], align 4
+// CHECK7-NEXT: [[TMP9:%.*]] = load i32, i32* [[I]], align 4
+// CHECK7-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP0]], i32 0, i32 [[TMP9]]
+// CHECK7-NEXT: [[TMP10:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+// CHECK7-NEXT: [[ADD1:%.*]] = add nsw i32 [[TMP10]], 1
+// CHECK7-NEXT: store i32 [[ADD1]], i32* [[ARRAYIDX]], align 4
+// CHECK7-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK7: omp.body.continue:
+// CHECK7-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK7: omp.inner.for.inc:
+// CHECK7-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK7-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK7-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP11]], [[TMP12]]
+// CHECK7-NEXT: store i32 [[ADD2]], i32* [[DOTOMP_IV]], align 4
+// CHECK7-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK7: omp.inner.for.end:
+// CHECK7-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK7: omp.loop.exit:
+// CHECK7-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP4]])
+// CHECK7-NEXT: ret void
+//
+//
+// CHECK7-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l59
+// CHECK7-SAME: ([10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i32 [[F:%.*]]) #[[ATTR0]] {
+// CHECK7-NEXT: entry:
+// CHECK7-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4
+// CHECK7-NEXT: [[F_ADDR:%.*]] = alloca i32, align 4
+// CHECK7-NEXT: [[F_CASTED:%.*]] = alloca i32, align 4
+// CHECK7-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK7-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK7-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK7-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4
+// CHECK7-NEXT: store i32 [[F]], i32* [[F_ADDR]], align 4
+// CHECK7-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4
+// CHECK7-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK7-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0)
+// CHECK7-NEXT: br label [[DOTEXECUTE:%.*]]
+// CHECK7: .execute:
+// CHECK7-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]])
+// CHECK7-NEXT: [[TMP2:%.*]] = load i32, i32* [[F_ADDR]], align 4
+// CHECK7-NEXT: store i32 [[TMP2]], i32* [[F_CASTED]], align 4
+// CHECK7-NEXT: [[TMP3:%.*]] = load i32, i32* [[F_CASTED]], align 4
+// CHECK7-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK7-NEXT: call void @__omp_outlined__6(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x [10 x i32]]* [[TMP0]], i32 [[TMP3]]) #[[ATTR2]]
+// CHECK7-NEXT: br label [[DOTOMP_DEINIT:%.*]]
+// CHECK7: .omp.deinit:
+// CHECK7-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0)
+// CHECK7-NEXT: br label [[DOTEXIT:%.*]]
+// CHECK7: .exit:
+// CHECK7-NEXT: ret void
+//
+//
+// CHECK7-LABEL: define {{[^@]+}}@__omp_outlined__6
+// CHECK7-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i32 [[F:%.*]]) #[[ATTR0]] {
+// CHECK7-NEXT: entry:
+// CHECK7-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK7-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK7-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4
+// CHECK7-NEXT: [[F_ADDR:%.*]] = alloca i32, align 4
+// CHECK7-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK7-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK7-NEXT: [[_TMP1:%.*]] = alloca i32, align 4
+// CHECK7-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK7-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK7-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK7-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK7-NEXT: [[K:%.*]] = alloca i32, align 4
+// CHECK7-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK7-NEXT: [[J:%.*]] = alloca i32, align 4
+// CHECK7-NEXT: [[F_CASTED:%.*]] = alloca i32, align 4
+// CHECK7-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x i8*], align 4
+// CHECK7-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK7-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK7-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4
+// CHECK7-NEXT: store i32 [[F]], i32* [[F_ADDR]], align 4
+// CHECK7-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4
+// CHECK7-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK7-NEXT: store i32 99, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK7-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK7-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK7-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK7-NEXT: [[TMP1:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK7-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4
+// CHECK7-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]])
+// CHECK7-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK7-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP3]], 99
+// CHECK7-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK7: cond.true:
+// CHECK7-NEXT: br label [[COND_END:%.*]]
+// CHECK7: cond.false:
+// CHECK7-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK7-NEXT: br label [[COND_END]]
+// CHECK7: cond.end:
+// CHECK7-NEXT: [[COND:%.*]] = phi i32 [ 99, [[COND_TRUE]] ], [ [[TMP4]], [[COND_FALSE]] ]
+// CHECK7-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK7-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK7-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4
+// CHECK7-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK7: omp.inner.for.cond:
+// CHECK7-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK7-NEXT: [[CMP2:%.*]] = icmp slt i32 [[TMP6]], 100
+// CHECK7-NEXT: br i1 [[CMP2]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK7: omp.inner.for.body:
+// CHECK7-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK7-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK7-NEXT: [[TMP9:%.*]] = load i32, i32* [[F_ADDR]], align 4
+// CHECK7-NEXT: store i32 [[TMP9]], i32* [[F_CASTED]], align 4
+// CHECK7-NEXT: [[TMP10:%.*]] = load i32, i32* [[F_CASTED]], align 4
+// CHECK7-NEXT: [[TMP11:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0
+// CHECK7-NEXT: [[TMP12:%.*]] = inttoptr i32 [[TMP7]] to i8*
+// CHECK7-NEXT: store i8* [[TMP12]], i8** [[TMP11]], align 4
+// CHECK7-NEXT: [[TMP13:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1
+// CHECK7-NEXT: [[TMP14:%.*]] = inttoptr i32 [[TMP8]] to i8*
+// CHECK7-NEXT: store i8* [[TMP14]], i8** [[TMP13]], align 4
+// CHECK7-NEXT: [[TMP15:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2
+// CHECK7-NEXT: [[TMP16:%.*]] = bitcast [10 x [10 x i32]]* [[TMP0]] to i8*
+// CHECK7-NEXT: store i8* [[TMP16]], i8** [[TMP15]], align 4
+// CHECK7-NEXT: [[TMP17:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3
+// CHECK7-NEXT: [[TMP18:%.*]] = inttoptr i32 [[TMP10]] to i8*
+// CHECK7-NEXT: store i8* [[TMP18]], i8** [[TMP17]], align 4
+// CHECK7-NEXT: [[TMP19:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK7-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, [10 x [10 x i32]]*, i32)* @__omp_outlined__7 to i8*), i8* null, i8** [[TMP19]], i32 4)
+// CHECK7-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK7: omp.inner.for.inc:
+// CHECK7-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK7-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK7-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP20]], [[TMP21]]
+// CHECK7-NEXT: store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4
+// CHECK7-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK7-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK7-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP22]], [[TMP23]]
+// CHECK7-NEXT: store i32 [[ADD3]], i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK7-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK7-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK7-NEXT: [[ADD4:%.*]] = add nsw i32 [[TMP24]], [[TMP25]]
+// CHECK7-NEXT: store i32 [[ADD4]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK7-NEXT: [[TMP26:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK7-NEXT: [[CMP5:%.*]] = icmp sgt i32 [[TMP26]], 99
+// CHECK7-NEXT: br i1 [[CMP5]], label [[COND_TRUE6:%.*]], label [[COND_FALSE7:%.*]]
+// CHECK7: cond.true6:
+// CHECK7-NEXT: br label [[COND_END8:%.*]]
+// CHECK7: cond.false7:
+// CHECK7-NEXT: [[TMP27:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK7-NEXT: br label [[COND_END8]]
+// CHECK7: cond.end8:
+// CHECK7-NEXT: [[COND9:%.*]] = phi i32 [ 99, [[COND_TRUE6]] ], [ [[TMP27]], [[COND_FALSE7]] ]
+// CHECK7-NEXT: store i32 [[COND9]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK7-NEXT: [[TMP28:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK7-NEXT: store i32 [[TMP28]], i32* [[DOTOMP_IV]], align 4
+// CHECK7-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK7: omp.inner.for.end:
+// CHECK7-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK7: omp.loop.exit:
+// CHECK7-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]])
+// CHECK7-NEXT: ret void
+//
+//
+// CHECK7-LABEL: define {{[^@]+}}@__omp_outlined__7
+// CHECK7-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i32 [[F:%.*]]) #[[ATTR0]] {
+// CHECK7-NEXT: entry:
+// CHECK7-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK7-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK7-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4
+// CHECK7-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4
+// CHECK7-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4
+// CHECK7-NEXT: [[F_ADDR:%.*]] = alloca i32, align 4
+// CHECK7-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK7-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK7-NEXT: [[_TMP1:%.*]] = alloca i32, align 4
+// CHECK7-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK7-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK7-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK7-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK7-NEXT: [[K:%.*]] = alloca i32, align 4
+// CHECK7-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK7-NEXT: [[J:%.*]] = alloca i32, align 4
+// CHECK7-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK7-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK7-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK7-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK7-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4
+// CHECK7-NEXT: store i32 [[F]], i32* [[F_ADDR]], align 4
+// CHECK7-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4
+// CHECK7-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK7-NEXT: store i32 99, i32* [[DOTOMP_UB]], align 4
+// CHECK7-NEXT: [[TMP1:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK7-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK7-NEXT: store i32 [[TMP1]], i32* [[DOTOMP_LB]], align 4
+// CHECK7-NEXT: store i32 [[TMP2]], i32* [[DOTOMP_UB]], align 4
+// CHECK7-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK7-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK7-NEXT: [[TMP3:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK7-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4
+// CHECK7-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP4]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK7-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK7-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4
+// CHECK7-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK7: omp.inner.for.cond:
+// CHECK7-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK7-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK7-NEXT: [[CMP:%.*]] = icmp ule i32 [[TMP6]], [[TMP7]]
+// CHECK7-NEXT: br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK7: omp.inner.for.body:
+// CHECK7-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK7-NEXT: [[DIV:%.*]] = sdiv i32 [[TMP8]], 10
+// CHECK7-NEXT: [[MUL:%.*]] = mul nsw i32 [[DIV]], 1
+// CHECK7-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK7-NEXT: store i32 [[ADD]], i32* [[I]], align 4
+// CHECK7-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK7-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK7-NEXT: [[DIV2:%.*]] = sdiv i32 [[TMP10]], 10
+// CHECK7-NEXT: [[MUL3:%.*]] = mul nsw i32 [[DIV2]], 10
+// CHECK7-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP9]], [[MUL3]]
+// CHECK7-NEXT: [[MUL4:%.*]] = mul nsw i32 [[SUB]], 1
+// CHECK7-NEXT: [[ADD5:%.*]] = add nsw i32 0, [[MUL4]]
+// CHECK7-NEXT: store i32 [[ADD5]], i32* [[J]], align 4
+// CHECK7-NEXT: store i32 10, i32* [[K]], align 4
+// CHECK7-NEXT: [[TMP11:%.*]] = load i32, i32* [[I]], align 4
+// CHECK7-NEXT: [[TMP12:%.*]] = load i32, i32* [[J]], align 4
+// CHECK7-NEXT: [[TMP13:%.*]] = load i32, i32* [[F_ADDR]], align 4
+// CHECK7-NEXT: [[MUL6:%.*]] = mul nsw i32 [[TMP12]], [[TMP13]]
+// CHECK7-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP11]], [[MUL6]]
+// CHECK7-NEXT: [[TMP14:%.*]] = load i32, i32* [[K]], align 4
+// CHECK7-NEXT: [[ADD8:%.*]] = add nsw i32 [[ADD7]], [[TMP14]]
+// CHECK7-NEXT: [[TMP15:%.*]] = load i32, i32* [[I]], align 4
+// CHECK7-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* [[TMP0]], i32 0, i32 [[TMP15]]
+// CHECK7-NEXT: [[TMP16:%.*]] = load i32, i32* [[J]], align 4
+// CHECK7-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[ARRAYIDX]], i32 0, i32 [[TMP16]]
+// CHECK7-NEXT: store i32 [[ADD8]], i32* [[ARRAYIDX9]], align 4
+// CHECK7-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK7: omp.body.continue:
+// CHECK7-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK7: omp.inner.for.inc:
+// CHECK7-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK7-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK7-NEXT: [[ADD10:%.*]] = add nsw i32 [[TMP17]], [[TMP18]]
+// CHECK7-NEXT: store i32 [[ADD10]], i32* [[DOTOMP_IV]], align 4
+// CHECK7-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK7: omp.inner.for.end:
+// CHECK7-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK7: omp.loop.exit:
+// CHECK7-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP4]])
+// CHECK7-NEXT: ret void
+//
+//
+// CHECK7-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l67
+// CHECK7-SAME: (i32 [[N:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]]) #[[ATTR0]] {
+// CHECK7-NEXT: entry:
+// CHECK7-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4
+// CHECK7-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4
+// CHECK7-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4
+// CHECK7-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK7-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK7-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK7-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4
+// CHECK7-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4
+// CHECK7-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4
+// CHECK7-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK7-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0)
+// CHECK7-NEXT: br label [[DOTEXECUTE:%.*]]
+// CHECK7: .execute:
+// CHECK7-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]])
+// CHECK7-NEXT: [[TMP2:%.*]] = load i32, i32* [[N_ADDR]], align 4
+// CHECK7-NEXT: store i32 [[TMP2]], i32* [[N_CASTED]], align 4
+// CHECK7-NEXT: [[TMP3:%.*]] = load i32, i32* [[N_CASTED]], align 4
+// CHECK7-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK7-NEXT: call void @__omp_outlined__8(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32 [[TMP3]], [10 x [10 x i32]]* [[TMP0]]) #[[ATTR2]]
+// CHECK7-NEXT: br label [[DOTOMP_DEINIT:%.*]]
+// CHECK7: .omp.deinit:
+// CHECK7-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0)
+// CHECK7-NEXT: br label [[DOTEXIT:%.*]]
+// CHECK7: .exit:
+// CHECK7-NEXT: ret void
+//
+//
+// CHECK7-LABEL: define {{[^@]+}}@__omp_outlined__8
+// CHECK7-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[N:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]]) #[[ATTR0]] {
+// CHECK7-NEXT: entry:
+// CHECK7-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK7-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK7-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4
+// CHECK7-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4
+// CHECK7-NEXT: [[DOTOMP_IV:%.*]] = alloca i64, align 8
+// CHECK7-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK7-NEXT: [[_TMP1:%.*]] = alloca i32, align 4
+// CHECK7-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK7-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4
+// CHECK7-NEXT: [[DOTCAPTURE_EXPR_3:%.*]] = alloca i64, align 8
+// CHECK7-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK7-NEXT: [[J:%.*]] = alloca i32, align 4
+// CHECK7-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i64, align 8
+// CHECK7-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i64, align 8
+// CHECK7-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i64, align 8
+// CHECK7-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK7-NEXT: [[I9:%.*]] = alloca i32, align 4
+// CHECK7-NEXT: [[J10:%.*]] = alloca i32, align 4
+// CHECK7-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4
+// CHECK7-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x i8*], align 4
+// CHECK7-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK7-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK7-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4
+// CHECK7-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4
+// CHECK7-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4
+// CHECK7-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4
+// CHECK7-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK7-NEXT: [[TMP2:%.*]] = load i32, i32* [[N_ADDR]], align 4
+// CHECK7-NEXT: store i32 [[TMP2]], i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK7-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK7-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP3]], 0
+// CHECK7-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK7-NEXT: [[CONV:%.*]] = sext i32 [[DIV]] to i64
+// CHECK7-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK7-NEXT: [[SUB4:%.*]] = sub nsw i32 [[TMP4]], 0
+// CHECK7-NEXT: [[DIV5:%.*]] = sdiv i32 [[SUB4]], 1
+// CHECK7-NEXT: [[CONV6:%.*]] = sext i32 [[DIV5]] to i64
+// CHECK7-NEXT: [[MUL:%.*]] = mul nsw i64 [[CONV]], [[CONV6]]
+// CHECK7-NEXT: [[SUB7:%.*]] = sub nsw i64 [[MUL]], 1
+// CHECK7-NEXT: store i64 [[SUB7]], i64* [[DOTCAPTURE_EXPR_3]], align 8
+// CHECK7-NEXT: store i32 0, i32* [[I]], align 4
+// CHECK7-NEXT: store i32 0, i32* [[J]], align 4
+// CHECK7-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK7-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP5]]
+// CHECK7-NEXT: br i1 [[CMP]], label [[LAND_LHS_TRUE:%.*]], label [[OMP_PRECOND_END:%.*]]
+// CHECK7: land.lhs.true:
+// CHECK7-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK7-NEXT: [[CMP8:%.*]] = icmp slt i32 0, [[TMP6]]
+// CHECK7-NEXT: br i1 [[CMP8]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END]]
+// CHECK7: omp.precond.then:
+// CHECK7-NEXT: store i64 0, i64* [[DOTOMP_COMB_LB]], align 8
+// CHECK7-NEXT: [[TMP7:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8
+// CHECK7-NEXT: store i64 [[TMP7]], i64* [[DOTOMP_COMB_UB]], align 8
+// CHECK7-NEXT: store i64 1, i64* [[DOTOMP_STRIDE]], align 8
+// CHECK7-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK7-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK7-NEXT: [[CONV11:%.*]] = zext i32 [[NVPTX_NUM_THREADS]] to i64
+// CHECK7-NEXT: [[TMP8:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK7-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP8]], align 4
+// CHECK7-NEXT: call void @__kmpc_for_static_init_8(%struct.ident_t* @[[GLOB1]], i32 [[TMP9]], i32 91, i32* [[DOTOMP_IS_LAST]], i64* [[DOTOMP_COMB_LB]], i64* [[DOTOMP_COMB_UB]], i64* [[DOTOMP_STRIDE]], i64 1, i64 [[CONV11]])
+// CHECK7-NEXT: [[TMP10:%.*]] = load i64, i64* [[DOTOMP_COMB_UB]], align 8
+// CHECK7-NEXT: [[TMP11:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8
+// CHECK7-NEXT: [[CMP12:%.*]] = icmp sgt i64 [[TMP10]], [[TMP11]]
+// CHECK7-NEXT: br i1 [[CMP12]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK7: cond.true:
+// CHECK7-NEXT: [[TMP12:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8
+// CHECK7-NEXT: br label [[COND_END:%.*]]
+// CHECK7: cond.false:
+// CHECK7-NEXT: [[TMP13:%.*]] = load i64, i64* [[DOTOMP_COMB_UB]], align 8
+// CHECK7-NEXT: br label [[COND_END]]
+// CHECK7: cond.end:
+// CHECK7-NEXT: [[COND:%.*]] = phi i64 [ [[TMP12]], [[COND_TRUE]] ], [ [[TMP13]], [[COND_FALSE]] ]
+// CHECK7-NEXT: store i64 [[COND]], i64* [[DOTOMP_COMB_UB]], align 8
+// CHECK7-NEXT: [[TMP14:%.*]] = load i64, i64* [[DOTOMP_COMB_LB]], align 8
+// CHECK7-NEXT: store i64 [[TMP14]], i64* [[DOTOMP_IV]], align 8
+// CHECK7-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK7: omp.inner.for.cond:
+// CHECK7-NEXT: [[TMP15:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8
+// CHECK7-NEXT: [[TMP16:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8
+// CHECK7-NEXT: [[ADD:%.*]] = add nsw i64 [[TMP16]], 1
+// CHECK7-NEXT: [[CMP13:%.*]] = icmp slt i64 [[TMP15]], [[ADD]]
+// CHECK7-NEXT: br i1 [[CMP13]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK7: omp.inner.for.body:
+// CHECK7-NEXT: [[TMP17:%.*]] = load i64, i64* [[DOTOMP_COMB_LB]], align 8
+// CHECK7-NEXT: [[TMP18:%.*]] = trunc i64 [[TMP17]] to i32
+// CHECK7-NEXT: [[TMP19:%.*]] = load i64, i64* [[DOTOMP_COMB_UB]], align 8
+// CHECK7-NEXT: [[TMP20:%.*]] = trunc i64 [[TMP19]] to i32
+// CHECK7-NEXT: [[TMP21:%.*]] = load i32, i32* [[N_ADDR]], align 4
+// CHECK7-NEXT: store i32 [[TMP21]], i32* [[N_CASTED]], align 4
+// CHECK7-NEXT: [[TMP22:%.*]] = load i32, i32* [[N_CASTED]], align 4
+// CHECK7-NEXT: [[TMP23:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0
+// CHECK7-NEXT: [[TMP24:%.*]] = inttoptr i32 [[TMP18]] to i8*
+// CHECK7-NEXT: store i8* [[TMP24]], i8** [[TMP23]], align 4
+// CHECK7-NEXT: [[TMP25:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1
+// CHECK7-NEXT: [[TMP26:%.*]] = inttoptr i32 [[TMP20]] to i8*
+// CHECK7-NEXT: store i8* [[TMP26]], i8** [[TMP25]], align 4
+// CHECK7-NEXT: [[TMP27:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2
+// CHECK7-NEXT: [[TMP28:%.*]] = inttoptr i32 [[TMP22]] to i8*
+// CHECK7-NEXT: store i8* [[TMP28]], i8** [[TMP27]], align 4
+// CHECK7-NEXT: [[TMP29:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3
+// CHECK7-NEXT: [[TMP30:%.*]] = bitcast [10 x [10 x i32]]* [[TMP0]] to i8*
+// CHECK7-NEXT: store i8* [[TMP30]], i8** [[TMP29]], align 4
+// CHECK7-NEXT: [[TMP31:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK7-NEXT: [[TMP32:%.*]] = load i32, i32* [[TMP31]], align 4
+// CHECK7-NEXT: [[TMP33:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK7-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP32]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, i32, [10 x [10 x i32]]*)* @__omp_outlined__9 to i8*), i8* null, i8** [[TMP33]], i32 4)
+// CHECK7-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK7: omp.inner.for.inc:
+// CHECK7-NEXT: [[TMP34:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8
+// CHECK7-NEXT: [[TMP35:%.*]] = load i64, i64* [[DOTOMP_STRIDE]], align 8
+// CHECK7-NEXT: [[ADD14:%.*]] = add nsw i64 [[TMP34]], [[TMP35]]
+// CHECK7-NEXT: store i64 [[ADD14]], i64* [[DOTOMP_IV]], align 8
+// CHECK7-NEXT: [[TMP36:%.*]] = load i64, i64* [[DOTOMP_COMB_LB]], align 8
+// CHECK7-NEXT: [[TMP37:%.*]] = load i64, i64* [[DOTOMP_STRIDE]], align 8
+// CHECK7-NEXT: [[ADD15:%.*]] = add nsw i64 [[TMP36]], [[TMP37]]
+// CHECK7-NEXT: store i64 [[ADD15]], i64* [[DOTOMP_COMB_LB]], align 8
+// CHECK7-NEXT: [[TMP38:%.*]] = load i64, i64* [[DOTOMP_COMB_UB]], align 8
+// CHECK7-NEXT: [[TMP39:%.*]] = load i64, i64* [[DOTOMP_STRIDE]], align 8
+// CHECK7-NEXT: [[ADD16:%.*]] = add nsw i64 [[TMP38]], [[TMP39]]
+// CHECK7-NEXT: store i64 [[ADD16]], i64* [[DOTOMP_COMB_UB]], align 8
+// CHECK7-NEXT: [[TMP40:%.*]] = load i64, i64* [[DOTOMP_COMB_UB]], align 8
+// CHECK7-NEXT: [[TMP41:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8
+// CHECK7-NEXT: [[CMP17:%.*]] = icmp sgt i64 [[TMP40]], [[TMP41]]
+// CHECK7-NEXT: br i1 [[CMP17]], label [[COND_TRUE18:%.*]], label [[COND_FALSE19:%.*]]
+// CHECK7: cond.true18:
+// CHECK7-NEXT: [[TMP42:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8
+// CHECK7-NEXT: br label [[COND_END20:%.*]]
+// CHECK7: cond.false19:
+// CHECK7-NEXT: [[TMP43:%.*]] = load i64, i64* [[DOTOMP_COMB_UB]], align 8
+// CHECK7-NEXT: br label [[COND_END20]]
+// CHECK7: cond.end20:
+// CHECK7-NEXT: [[COND21:%.*]] = phi i64 [ [[TMP42]], [[COND_TRUE18]] ], [ [[TMP43]], [[COND_FALSE19]] ]
+// CHECK7-NEXT: store i64 [[COND21]], i64* [[DOTOMP_COMB_UB]], align 8
+// CHECK7-NEXT: [[TMP44:%.*]] = load i64, i64* [[DOTOMP_COMB_LB]], align 8
+// CHECK7-NEXT: store i64 [[TMP44]], i64* [[DOTOMP_IV]], align 8
+// CHECK7-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK7: omp.inner.for.end:
+// CHECK7-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK7: omp.loop.exit:
+// CHECK7-NEXT: [[TMP45:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK7-NEXT: [[TMP46:%.*]] = load i32, i32* [[TMP45]], align 4
+// CHECK7-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP46]])
+// CHECK7-NEXT: br label [[OMP_PRECOND_END]]
+// CHECK7: omp.precond.end:
+// CHECK7-NEXT: ret void
+//
+//
+// CHECK7-LABEL: define {{[^@]+}}@__omp_outlined__9
+// CHECK7-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32 [[N:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]]) #[[ATTR0]] {
+// CHECK7-NEXT: entry:
+// CHECK7-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK7-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK7-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4
+// CHECK7-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4
+// CHECK7-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4
+// CHECK7-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4
+// CHECK7-NEXT: [[DOTOMP_IV:%.*]] = alloca i64, align 8
+// CHECK7-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK7-NEXT: [[_TMP1:%.*]] = alloca i32, align 4
+// CHECK7-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK7-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4
+// CHECK7-NEXT: [[DOTCAPTURE_EXPR_3:%.*]] = alloca i64, align 8
+// CHECK7-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK7-NEXT: [[J:%.*]] = alloca i32, align 
4 +// CHECK7-NEXT: [[DOTOMP_LB:%.*]] = alloca i64, align 8 +// CHECK7-NEXT: [[DOTOMP_UB:%.*]] = alloca i64, align 8 +// CHECK7-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i64, align 8 +// CHECK7-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: [[I11:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: [[J12:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK7-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK7-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK7-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK7-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK7-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4 +// CHECK7-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4 +// CHECK7-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK7-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK7-NEXT: [[TMP2:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK7-NEXT: store i32 [[TMP2]], i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK7-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK7-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP3]], 0 +// CHECK7-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK7-NEXT: [[CONV:%.*]] = sext i32 [[DIV]] to i64 +// CHECK7-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK7-NEXT: [[SUB4:%.*]] = sub nsw i32 [[TMP4]], 0 +// CHECK7-NEXT: [[DIV5:%.*]] = sdiv i32 [[SUB4]], 1 +// CHECK7-NEXT: [[CONV6:%.*]] = sext i32 [[DIV5]] to i64 +// CHECK7-NEXT: [[MUL:%.*]] = mul nsw i64 [[CONV]], [[CONV6]] +// CHECK7-NEXT: [[SUB7:%.*]] = sub nsw i64 [[MUL]], 1 +// CHECK7-NEXT: store i64 [[SUB7]], i64* [[DOTCAPTURE_EXPR_3]], align 8 +// CHECK7-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK7-NEXT: store i32 0, i32* [[J]], align 4 +// CHECK7-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK7-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP5]] +// CHECK7-NEXT: br i1 [[CMP]], label [[LAND_LHS_TRUE:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK7: land.lhs.true: +// CHECK7-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK7-NEXT: [[CMP8:%.*]] = icmp slt i32 0, [[TMP6]] +// CHECK7-NEXT: br i1 [[CMP8]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END]] +// CHECK7: omp.precond.then: +// CHECK7-NEXT: store i64 0, i64* [[DOTOMP_LB]], align 8 +// CHECK7-NEXT: [[TMP7:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8 +// CHECK7-NEXT: store i64 [[TMP7]], i64* [[DOTOMP_UB]], align 8 +// CHECK7-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK7-NEXT: [[CONV9:%.*]] = zext i32 [[TMP8]] to i64 +// CHECK7-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK7-NEXT: [[CONV10:%.*]] = zext i32 [[TMP9]] to i64 +// CHECK7-NEXT: store i64 [[CONV9]], i64* [[DOTOMP_LB]], align 8 +// CHECK7-NEXT: store i64 [[CONV10]], i64* [[DOTOMP_UB]], align 8 +// CHECK7-NEXT: store i64 1, i64* [[DOTOMP_STRIDE]], align 8 +// CHECK7-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK7-NEXT: [[TMP10:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK7-NEXT: [[TMP11:%.*]] = load i32, i32* [[TMP10]], align 4 +// CHECK7-NEXT: call void @__kmpc_for_static_init_8(%struct.ident_t* @[[GLOB2]], i32 [[TMP11]], i32 33, i32* [[DOTOMP_IS_LAST]], i64* [[DOTOMP_LB]], i64* [[DOTOMP_UB]], 
i64* [[DOTOMP_STRIDE]], i64 1, i64 1) +// CHECK7-NEXT: [[TMP12:%.*]] = load i64, i64* [[DOTOMP_LB]], align 8 +// CHECK7-NEXT: store i64 [[TMP12]], i64* [[DOTOMP_IV]], align 8 +// CHECK7-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK7: omp.inner.for.cond: +// CHECK7-NEXT: [[TMP13:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8 +// CHECK7-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK7-NEXT: [[CONV13:%.*]] = zext i32 [[TMP14]] to i64 +// CHECK7-NEXT: [[CMP14:%.*]] = icmp sle i64 [[TMP13]], [[CONV13]] +// CHECK7-NEXT: br i1 [[CMP14]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK7: omp.inner.for.body: +// CHECK7-NEXT: [[TMP15:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8 +// CHECK7-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK7-NEXT: [[SUB15:%.*]] = sub nsw i32 [[TMP16]], 0 +// CHECK7-NEXT: [[DIV16:%.*]] = sdiv i32 [[SUB15]], 1 +// CHECK7-NEXT: [[MUL17:%.*]] = mul nsw i32 1, [[DIV16]] +// CHECK7-NEXT: [[CONV18:%.*]] = sext i32 [[MUL17]] to i64 +// CHECK7-NEXT: [[DIV19:%.*]] = sdiv i64 [[TMP15]], [[CONV18]] +// CHECK7-NEXT: [[MUL20:%.*]] = mul nsw i64 [[DIV19]], 1 +// CHECK7-NEXT: [[ADD:%.*]] = add nsw i64 0, [[MUL20]] +// CHECK7-NEXT: [[CONV21:%.*]] = trunc i64 [[ADD]] to i32 +// CHECK7-NEXT: store i32 [[CONV21]], i32* [[I11]], align 4 +// CHECK7-NEXT: [[TMP17:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8 +// CHECK7-NEXT: [[TMP18:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8 +// CHECK7-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK7-NEXT: [[SUB22:%.*]] = sub nsw i32 [[TMP19]], 0 +// CHECK7-NEXT: [[DIV23:%.*]] = sdiv i32 [[SUB22]], 1 +// CHECK7-NEXT: [[MUL24:%.*]] = mul nsw i32 1, [[DIV23]] +// CHECK7-NEXT: [[CONV25:%.*]] = sext i32 [[MUL24]] to i64 +// CHECK7-NEXT: [[DIV26:%.*]] = sdiv i64 [[TMP18]], [[CONV25]] +// CHECK7-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK7-NEXT: [[SUB27:%.*]] = sub nsw i32 [[TMP20]], 0 +// CHECK7-NEXT: [[DIV28:%.*]] = sdiv i32 [[SUB27]], 1 +// CHECK7-NEXT: [[MUL29:%.*]] = mul nsw i32 1, [[DIV28]] +// CHECK7-NEXT: [[CONV30:%.*]] = sext i32 [[MUL29]] to i64 +// CHECK7-NEXT: [[MUL31:%.*]] = mul nsw i64 [[DIV26]], [[CONV30]] +// CHECK7-NEXT: [[SUB32:%.*]] = sub nsw i64 [[TMP17]], [[MUL31]] +// CHECK7-NEXT: [[MUL33:%.*]] = mul nsw i64 [[SUB32]], 1 +// CHECK7-NEXT: [[ADD34:%.*]] = add nsw i64 0, [[MUL33]] +// CHECK7-NEXT: [[CONV35:%.*]] = trunc i64 [[ADD34]] to i32 +// CHECK7-NEXT: store i32 [[CONV35]], i32* [[J12]], align 4 +// CHECK7-NEXT: [[TMP21:%.*]] = load i32, i32* [[I11]], align 4 +// CHECK7-NEXT: [[TMP22:%.*]] = load i32, i32* [[J12]], align 4 +// CHECK7-NEXT: [[ADD36:%.*]] = add nsw i32 [[TMP21]], [[TMP22]] +// CHECK7-NEXT: [[TMP23:%.*]] = load i32, i32* [[I11]], align 4 +// CHECK7-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* [[TMP0]], i32 0, i32 [[TMP23]] +// CHECK7-NEXT: [[TMP24:%.*]] = load i32, i32* [[J12]], align 4 +// CHECK7-NEXT: [[ARRAYIDX37:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[ARRAYIDX]], i32 0, i32 [[TMP24]] +// CHECK7-NEXT: store i32 [[ADD36]], i32* [[ARRAYIDX37]], align 4 +// CHECK7-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK7: omp.body.continue: +// CHECK7-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK7: omp.inner.for.inc: +// CHECK7-NEXT: [[TMP25:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8 +// CHECK7-NEXT: [[TMP26:%.*]] = load i64, i64* [[DOTOMP_STRIDE]], align 8 +// CHECK7-NEXT: [[ADD38:%.*]] = add nsw i64 
[[TMP25]], [[TMP26]] +// CHECK7-NEXT: store i64 [[ADD38]], i64* [[DOTOMP_IV]], align 8 +// CHECK7-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK7: omp.inner.for.end: +// CHECK7-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK7: omp.loop.exit: +// CHECK7-NEXT: [[TMP27:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK7-NEXT: [[TMP28:%.*]] = load i32, i32* [[TMP27]], align 4 +// CHECK7-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP28]]) +// CHECK7-NEXT: br label [[OMP_PRECOND_END]] +// CHECK7: omp.precond.end: +// CHECK7-NEXT: ret void +// +// +// CHECK7-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l74 +// CHECK7-SAME: (i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32* [[V:%.*]]) #[[ATTR0]] { +// CHECK7-NEXT: entry: +// CHECK7-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4 +// CHECK7-NEXT: [[V_ADDR:%.*]] = alloca i32*, align 4 +// CHECK7-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK7-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK7-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4 +// CHECK7-NEXT: store i32* [[V]], i32** [[V_ADDR]], align 4 +// CHECK7-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4 +// CHECK7-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK7-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) +// CHECK7-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK7: .execute: +// CHECK7-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) +// CHECK7-NEXT: [[TMP2:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK7-NEXT: store i32 [[TMP2]], i32* [[N_CASTED]], align 4 +// CHECK7-NEXT: [[TMP3:%.*]] = load i32, i32* [[N_CASTED]], align 4 +// CHECK7-NEXT: [[TMP4:%.*]] = load i32*, i32** [[V_ADDR]], align 4 +// CHECK7-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK7-NEXT: call void @__omp_outlined__10(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32 [[TMP3]], [1000 x i32]* [[TMP0]], i32* [[TMP4]]) #[[ATTR2]] +// CHECK7-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK7: .omp.deinit: +// CHECK7-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) +// CHECK7-NEXT: br label [[DOTEXIT:%.*]] +// CHECK7: .exit: +// CHECK7-NEXT: ret void +// +// +// CHECK7-LABEL: define {{[^@]+}}@__omp_outlined__10 +// CHECK7-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32* [[V:%.*]]) #[[ATTR0]] { +// CHECK7-NEXT: entry: +// CHECK7-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK7-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK7-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4 +// CHECK7-NEXT: [[V_ADDR:%.*]] = alloca i32*, align 4 +// CHECK7-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 +// 
CHECK7-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: [[I3:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [5 x i8*], align 4 +// CHECK7-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK7-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK7-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK7-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4 +// CHECK7-NEXT: store i32* [[V]], i32** [[V_ADDR]], align 4 +// CHECK7-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4 +// CHECK7-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK7-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK7-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK7-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 +// CHECK7-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK7-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK7-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK7-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK7-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK7-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] +// CHECK7-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK7: omp.precond.then: +// CHECK7-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK7-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK7-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK7-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK7-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK7-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK7-NEXT: [[TMP5:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK7-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4 +// CHECK7-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP6]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) +// CHECK7-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK7-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK7-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP7]], [[TMP8]] +// CHECK7-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK7: cond.true: +// CHECK7-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK7-NEXT: br label [[COND_END:%.*]] +// CHECK7: cond.false: +// CHECK7-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK7-NEXT: br label [[COND_END]] +// CHECK7: cond.end: +// CHECK7-NEXT: [[COND:%.*]] = phi i32 [ [[TMP9]], [[COND_TRUE]] ], [ [[TMP10]], [[COND_FALSE]] ] +// CHECK7-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK7-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK7-NEXT: store i32 [[TMP11]], i32* [[DOTOMP_IV]], align 4 +// CHECK7-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK7: omp.inner.for.cond: +// CHECK7-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK7-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK7-NEXT: 
[[ADD:%.*]] = add nsw i32 [[TMP13]], 1 +// CHECK7-NEXT: [[CMP5:%.*]] = icmp slt i32 [[TMP12]], [[ADD]] +// CHECK7-NEXT: br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK7: omp.inner.for.body: +// CHECK7-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK7-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK7-NEXT: [[TMP16:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK7-NEXT: store i32 [[TMP16]], i32* [[N_CASTED]], align 4 +// CHECK7-NEXT: [[TMP17:%.*]] = load i32, i32* [[N_CASTED]], align 4 +// CHECK7-NEXT: [[TMP18:%.*]] = load i32*, i32** [[V_ADDR]], align 4 +// CHECK7-NEXT: [[TMP19:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 +// CHECK7-NEXT: [[TMP20:%.*]] = inttoptr i32 [[TMP14]] to i8* +// CHECK7-NEXT: store i8* [[TMP20]], i8** [[TMP19]], align 4 +// CHECK7-NEXT: [[TMP21:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 +// CHECK7-NEXT: [[TMP22:%.*]] = inttoptr i32 [[TMP15]] to i8* +// CHECK7-NEXT: store i8* [[TMP22]], i8** [[TMP21]], align 4 +// CHECK7-NEXT: [[TMP23:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 +// CHECK7-NEXT: [[TMP24:%.*]] = inttoptr i32 [[TMP17]] to i8* +// CHECK7-NEXT: store i8* [[TMP24]], i8** [[TMP23]], align 4 +// CHECK7-NEXT: [[TMP25:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3 +// CHECK7-NEXT: [[TMP26:%.*]] = bitcast [1000 x i32]* [[TMP0]] to i8* +// CHECK7-NEXT: store i8* [[TMP26]], i8** [[TMP25]], align 4 +// CHECK7-NEXT: [[TMP27:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 4 +// CHECK7-NEXT: [[TMP28:%.*]] = bitcast i32* [[TMP18]] to i8* +// CHECK7-NEXT: store i8* [[TMP28]], i8** [[TMP27]], align 4 +// CHECK7-NEXT: [[TMP29:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK7-NEXT: [[TMP30:%.*]] = load i32, i32* [[TMP29]], align 4 +// CHECK7-NEXT: [[TMP31:%.*]] = bitcast [5 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK7-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP30]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, i32, [1000 x i32]*, i32*)* @__omp_outlined__11 to i8*), i8* null, i8** [[TMP31]], i32 5) +// CHECK7-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK7: omp.inner.for.inc: +// CHECK7-NEXT: [[TMP32:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK7-NEXT: [[TMP33:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK7-NEXT: [[ADD6:%.*]] = add nsw i32 [[TMP32]], [[TMP33]] +// CHECK7-NEXT: store i32 [[ADD6]], i32* [[DOTOMP_IV]], align 4 +// CHECK7-NEXT: [[TMP34:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK7-NEXT: [[TMP35:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK7-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP34]], [[TMP35]] +// CHECK7-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK7-NEXT: [[TMP36:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK7-NEXT: [[TMP37:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK7-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP36]], [[TMP37]] +// CHECK7-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK7-NEXT: [[TMP38:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK7-NEXT: [[TMP39:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK7-NEXT: [[CMP9:%.*]] = icmp sgt i32 [[TMP38]], [[TMP39]] +// CHECK7-NEXT: br i1 [[CMP9]], label 
[[COND_TRUE10:%.*]], label [[COND_FALSE11:%.*]] +// CHECK7: cond.true10: +// CHECK7-NEXT: [[TMP40:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK7-NEXT: br label [[COND_END12:%.*]] +// CHECK7: cond.false11: +// CHECK7-NEXT: [[TMP41:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK7-NEXT: br label [[COND_END12]] +// CHECK7: cond.end12: +// CHECK7-NEXT: [[COND13:%.*]] = phi i32 [ [[TMP40]], [[COND_TRUE10]] ], [ [[TMP41]], [[COND_FALSE11]] ] +// CHECK7-NEXT: store i32 [[COND13]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK7-NEXT: [[TMP42:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK7-NEXT: store i32 [[TMP42]], i32* [[DOTOMP_IV]], align 4 +// CHECK7-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK7: omp.inner.for.end: +// CHECK7-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK7: omp.loop.exit: +// CHECK7-NEXT: [[TMP43:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK7-NEXT: [[TMP44:%.*]] = load i32, i32* [[TMP43]], align 4 +// CHECK7-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP44]]) +// CHECK7-NEXT: br label [[OMP_PRECOND_END]] +// CHECK7: omp.precond.end: +// CHECK7-NEXT: ret void +// +// +// CHECK7-LABEL: define {{[^@]+}}@__omp_outlined__11 +// CHECK7-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32* [[V:%.*]]) #[[ATTR0]] { +// CHECK7-NEXT: entry: +// CHECK7-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK7-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK7-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4 +// CHECK7-NEXT: [[V_ADDR:%.*]] = alloca i32*, align 4 +// CHECK7-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: [[I3:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK7-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK7-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK7-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK7-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK7-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4 +// CHECK7-NEXT: store i32* [[V]], i32** [[V_ADDR]], align 4 +// CHECK7-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4 +// CHECK7-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK7-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK7-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK7-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 +// CHECK7-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK7-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 
+// CHECK7-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK7-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK7-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK7-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] +// CHECK7-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK7: omp.precond.then: +// CHECK7-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 +// CHECK7-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK7-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_UB]], align 4 +// CHECK7-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK7-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK7-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_LB]], align 4 +// CHECK7-NEXT: store i32 [[TMP6]], i32* [[DOTOMP_UB]], align 4 +// CHECK7-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK7-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK7-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK7-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4 +// CHECK7-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP8]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) +// CHECK7-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK7-NEXT: store i32 [[TMP9]], i32* [[DOTOMP_IV]], align 4 +// CHECK7-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK7: omp.inner.for.cond: +// CHECK7-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK7-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK7-NEXT: [[CMP4:%.*]] = icmp ule i32 [[TMP10]], [[TMP11]] +// CHECK7-NEXT: br i1 [[CMP4]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK7: omp.inner.for.body: +// CHECK7-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK7-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP12]], 1 +// CHECK7-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK7-NEXT: store i32 [[ADD]], i32* [[I3]], align 4 +// CHECK7-NEXT: [[TMP13:%.*]] = load i32*, i32** [[V_ADDR]], align 4 +// CHECK7-NEXT: [[TMP14:%.*]] = load i32, i32* [[I3]], align 4 +// CHECK7-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TMP13]], i32 [[TMP14]] +// CHECK7-NEXT: [[TMP15:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +// CHECK7-NEXT: [[TMP16:%.*]] = load i32, i32* [[I3]], align 4 +// CHECK7-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds [1000 x i32], [1000 x i32]* [[TMP0]], i32 0, i32 [[TMP16]] +// CHECK7-NEXT: store i32 [[TMP15]], i32* [[ARRAYIDX5]], align 4 +// CHECK7-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK7: omp.body.continue: +// CHECK7-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK7: omp.inner.for.inc: +// CHECK7-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK7-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK7-NEXT: [[ADD6:%.*]] = add nsw i32 [[TMP17]], [[TMP18]] +// CHECK7-NEXT: store i32 [[ADD6]], i32* [[DOTOMP_IV]], align 4 +// CHECK7-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK7: omp.inner.for.end: +// CHECK7-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK7: omp.loop.exit: +// CHECK7-NEXT: [[TMP19:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK7-NEXT: [[TMP20:%.*]] = load i32, i32* [[TMP19]], align 4 +// CHECK7-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP20]]) +// 
CHECK7-NEXT: br label [[OMP_PRECOND_END]] +// CHECK7: omp.precond.end: +// CHECK7-NEXT: ret void +// +// +// CHECK8-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l43 +// CHECK8-SAME: (i64 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i64 [[L:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK8-NEXT: entry: +// CHECK8-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 +// CHECK8-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 8 +// CHECK8-NEXT: [[L_ADDR:%.*]] = alloca i64, align 8 +// CHECK8-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8 +// CHECK8-NEXT: [[L_CASTED:%.*]] = alloca i64, align 8 +// CHECK8-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK8-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 +// CHECK8-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 8 +// CHECK8-NEXT: store i64 [[L]], i64* [[L_ADDR]], align 8 +// CHECK8-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* +// CHECK8-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 8 +// CHECK8-NEXT: [[CONV1:%.*]] = bitcast i64* [[L_ADDR]] to i32* +// CHECK8-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK8-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) +// CHECK8-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK8: .execute: +// CHECK8-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3:[0-9]+]]) +// CHECK8-NEXT: [[TMP2:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK8-NEXT: [[CONV2:%.*]] = bitcast i64* [[N_CASTED]] to i32* +// CHECK8-NEXT: store i32 [[TMP2]], i32* [[CONV2]], align 4 +// CHECK8-NEXT: [[TMP3:%.*]] = load i64, i64* [[N_CASTED]], align 8 +// CHECK8-NEXT: [[TMP4:%.*]] = load i32, i32* [[CONV1]], align 8 +// CHECK8-NEXT: [[CONV3:%.*]] = bitcast i64* [[L_CASTED]] to i32* +// CHECK8-NEXT: store i32 [[TMP4]], i32* [[CONV3]], align 4 +// CHECK8-NEXT: [[TMP5:%.*]] = load i64, i64* [[L_CASTED]], align 8 +// CHECK8-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK8-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i64 [[TMP3]], [1000 x i32]* [[TMP0]], i64 [[TMP5]]) #[[ATTR2:[0-9]+]] +// CHECK8-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK8: .omp.deinit: +// CHECK8-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) +// CHECK8-NEXT: br label [[DOTEXIT:%.*]] +// CHECK8: .exit: +// CHECK8-NEXT: ret void +// +// +// CHECK8-LABEL: define {{[^@]+}}@__omp_outlined__ +// CHECK8-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i64 [[L:%.*]]) #[[ATTR0]] { +// CHECK8-NEXT: entry: +// CHECK8-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK8-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK8-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 +// CHECK8-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 8 +// CHECK8-NEXT: [[L_ADDR:%.*]] = alloca i64, align 8 +// CHECK8-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: [[DOTCAPTURE_EXPR_3:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: [[DOTOMP_COMB_UB:%.*]] = 
alloca i32, align 4 +// CHECK8-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: [[I5:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8 +// CHECK8-NEXT: [[L_CASTED:%.*]] = alloca i64, align 8 +// CHECK8-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [5 x i8*], align 8 +// CHECK8-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK8-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +// CHECK8-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 +// CHECK8-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 8 +// CHECK8-NEXT: store i64 [[L]], i64* [[L_ADDR]], align 8 +// CHECK8-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* +// CHECK8-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 8 +// CHECK8-NEXT: [[CONV1:%.*]] = bitcast i64* [[L_ADDR]] to i32* +// CHECK8-NEXT: [[TMP1:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared", align 2 +// CHECK8-NEXT: [[TMP2:%.*]] = load i64, i64* @"_openmp_static_kernel$size", align 8 +// CHECK8-NEXT: call void @__kmpc_get_team_static_memory(i16 1, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds (%"union._shared_openmp_static_memory_type_$_", %"union._shared_openmp_static_memory_type_$_" addrspace(3)* @"_openmp_shared_static_glob_rd_$_", i32 0, i32 0, i32 0) to i8*), i64 [[TMP2]], i16 [[TMP1]], i8** addrspacecast (i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr" to i8**)) +// CHECK8-NEXT: [[TMP3:%.*]] = load i8*, i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr", align 8 +// CHECK8-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, i8* [[TMP3]], i64 0 +// CHECK8-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to %struct._globalized_locals_ty* +// CHECK8-NEXT: [[L2:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP5]], i32 0, i32 0 +// CHECK8-NEXT: [[TMP6:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK8-NEXT: store i32 [[TMP6]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK8-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK8-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP7]], 0 +// CHECK8-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK8-NEXT: [[SUB4:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK8-NEXT: store i32 [[SUB4]], i32* [[DOTCAPTURE_EXPR_3]], align 4 +// CHECK8-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK8-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK8-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP8]] +// CHECK8-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK8: omp.precond.then: +// CHECK8-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK8-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4 +// CHECK8-NEXT: store i32 [[TMP9]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK8-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK8-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK8-NEXT: [[TMP10:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK8-NEXT: [[TMP11:%.*]] = load i32, i32* [[TMP10]], align 4 +// CHECK8-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1:[0-9]+]], i32 [[TMP11]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 128) +// CHECK8-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK8-NEXT: [[TMP13:%.*]] = load 
i32, i32* [[DOTCAPTURE_EXPR_3]], align 4 +// CHECK8-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[TMP12]], [[TMP13]] +// CHECK8-NEXT: br i1 [[CMP6]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK8: cond.true: +// CHECK8-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4 +// CHECK8-NEXT: br label [[COND_END:%.*]] +// CHECK8: cond.false: +// CHECK8-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK8-NEXT: br label [[COND_END]] +// CHECK8: cond.end: +// CHECK8-NEXT: [[COND:%.*]] = phi i32 [ [[TMP14]], [[COND_TRUE]] ], [ [[TMP15]], [[COND_FALSE]] ] +// CHECK8-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK8-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK8-NEXT: store i32 [[TMP16]], i32* [[DOTOMP_IV]], align 4 +// CHECK8-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK8: omp.inner.for.cond: +// CHECK8-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK8-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4 +// CHECK8-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP18]], 1 +// CHECK8-NEXT: [[CMP7:%.*]] = icmp slt i32 [[TMP17]], [[ADD]] +// CHECK8-NEXT: br i1 [[CMP7]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK8: omp.inner.for.body: +// CHECK8-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK8-NEXT: [[TMP20:%.*]] = zext i32 [[TMP19]] to i64 +// CHECK8-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK8-NEXT: [[TMP22:%.*]] = zext i32 [[TMP21]] to i64 +// CHECK8-NEXT: [[TMP23:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK8-NEXT: [[CONV8:%.*]] = bitcast i64* [[N_CASTED]] to i32* +// CHECK8-NEXT: store i32 [[TMP23]], i32* [[CONV8]], align 4 +// CHECK8-NEXT: [[TMP24:%.*]] = load i64, i64* [[N_CASTED]], align 8 +// CHECK8-NEXT: [[TMP25:%.*]] = load i32, i32* [[CONV1]], align 8 +// CHECK8-NEXT: [[CONV9:%.*]] = bitcast i64* [[L_CASTED]] to i32* +// CHECK8-NEXT: store i32 [[TMP25]], i32* [[CONV9]], align 4 +// CHECK8-NEXT: [[TMP26:%.*]] = load i64, i64* [[L_CASTED]], align 8 +// CHECK8-NEXT: [[TMP27:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0 +// CHECK8-NEXT: [[TMP28:%.*]] = inttoptr i64 [[TMP20]] to i8* +// CHECK8-NEXT: store i8* [[TMP28]], i8** [[TMP27]], align 8 +// CHECK8-NEXT: [[TMP29:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1 +// CHECK8-NEXT: [[TMP30:%.*]] = inttoptr i64 [[TMP22]] to i8* +// CHECK8-NEXT: store i8* [[TMP30]], i8** [[TMP29]], align 8 +// CHECK8-NEXT: [[TMP31:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 2 +// CHECK8-NEXT: [[TMP32:%.*]] = inttoptr i64 [[TMP24]] to i8* +// CHECK8-NEXT: store i8* [[TMP32]], i8** [[TMP31]], align 8 +// CHECK8-NEXT: [[TMP33:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 3 +// CHECK8-NEXT: [[TMP34:%.*]] = bitcast [1000 x i32]* [[TMP0]] to i8* +// CHECK8-NEXT: store i8* [[TMP34]], i8** [[TMP33]], align 8 +// CHECK8-NEXT: [[TMP35:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 4 +// CHECK8-NEXT: [[TMP36:%.*]] = inttoptr i64 [[TMP26]] to i8* +// CHECK8-NEXT: store i8* [[TMP36]], i8** [[TMP35]], align 8 +// CHECK8-NEXT: [[TMP37:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK8-NEXT: [[TMP38:%.*]] = load i32, i32* [[TMP37]], align 4 +// CHECK8-NEXT: [[TMP39:%.*]] = bitcast [5 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// 
CHECK8-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP38]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i64, i64, i64, [1000 x i32]*, i64)* @__omp_outlined__1 to i8*), i8* null, i8** [[TMP39]], i64 5) +// CHECK8-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK8: omp.inner.for.inc: +// CHECK8-NEXT: [[TMP40:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK8-NEXT: [[TMP41:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK8-NEXT: [[ADD10:%.*]] = add nsw i32 [[TMP40]], [[TMP41]] +// CHECK8-NEXT: store i32 [[ADD10]], i32* [[DOTOMP_IV]], align 4 +// CHECK8-NEXT: [[TMP42:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK8-NEXT: [[TMP43:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK8-NEXT: [[ADD11:%.*]] = add nsw i32 [[TMP42]], [[TMP43]] +// CHECK8-NEXT: store i32 [[ADD11]], i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK8-NEXT: [[TMP44:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK8-NEXT: [[TMP45:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK8-NEXT: [[ADD12:%.*]] = add nsw i32 [[TMP44]], [[TMP45]] +// CHECK8-NEXT: store i32 [[ADD12]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK8-NEXT: [[TMP46:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK8-NEXT: [[TMP47:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4 +// CHECK8-NEXT: [[CMP13:%.*]] = icmp sgt i32 [[TMP46]], [[TMP47]] +// CHECK8-NEXT: br i1 [[CMP13]], label [[COND_TRUE14:%.*]], label [[COND_FALSE15:%.*]] +// CHECK8: cond.true14: +// CHECK8-NEXT: [[TMP48:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4 +// CHECK8-NEXT: br label [[COND_END16:%.*]] +// CHECK8: cond.false15: +// CHECK8-NEXT: [[TMP49:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK8-NEXT: br label [[COND_END16]] +// CHECK8: cond.end16: +// CHECK8-NEXT: [[COND17:%.*]] = phi i32 [ [[TMP48]], [[COND_TRUE14]] ], [ [[TMP49]], [[COND_FALSE15]] ] +// CHECK8-NEXT: store i32 [[COND17]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK8-NEXT: [[TMP50:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK8-NEXT: store i32 [[TMP50]], i32* [[DOTOMP_IV]], align 4 +// CHECK8-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK8: omp.inner.for.end: +// CHECK8-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK8: omp.loop.exit: +// CHECK8-NEXT: [[TMP51:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK8-NEXT: [[TMP52:%.*]] = load i32, i32* [[TMP51]], align 4 +// CHECK8-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP52]]) +// CHECK8-NEXT: [[TMP53:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK8-NEXT: [[TMP54:%.*]] = icmp ne i32 [[TMP53]], 0 +// CHECK8-NEXT: br i1 [[TMP54]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]] +// CHECK8: .omp.lastprivate.then: +// CHECK8-NEXT: [[TMP55:%.*]] = load i32, i32* [[CONV1]], align 8 +// CHECK8-NEXT: store i32 [[TMP55]], i32* [[CONV1]], align 8 +// CHECK8-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]] +// CHECK8: .omp.lastprivate.done: +// CHECK8-NEXT: br label [[OMP_PRECOND_END]] +// CHECK8: omp.precond.end: +// CHECK8-NEXT: [[TMP56:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared", align 2 +// CHECK8-NEXT: call void @__kmpc_restore_team_static_memory(i16 1, i16 [[TMP56]]) +// CHECK8-NEXT: ret void +// +// +// CHECK8-LABEL: define {{[^@]+}}@__omp_outlined__1 +// CHECK8-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], i64 [[N:%.*]], [1000 x i32]* nonnull align 4 
dereferenceable(4000) [[A:%.*]], i64 [[L:%.*]]) #[[ATTR0]] { +// CHECK8-NEXT: entry: +// CHECK8-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK8-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK8-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8 +// CHECK8-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8 +// CHECK8-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 +// CHECK8-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 8 +// CHECK8-NEXT: [[L_ADDR:%.*]] = alloca i64, align 8 +// CHECK8-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: [[I6:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK8-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +// CHECK8-NEXT: store i64 [[DOTPREVIOUS_LB_]], i64* [[DOTPREVIOUS_LB__ADDR]], align 8 +// CHECK8-NEXT: store i64 [[DOTPREVIOUS_UB_]], i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK8-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 +// CHECK8-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 8 +// CHECK8-NEXT: store i64 [[L]], i64* [[L_ADDR]], align 8 +// CHECK8-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* +// CHECK8-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 8 +// CHECK8-NEXT: [[CONV1:%.*]] = bitcast i64* [[L_ADDR]] to i32* +// CHECK8-NEXT: [[TMP1:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK8-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK8-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK8-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 +// CHECK8-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK8-NEXT: [[SUB3:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK8-NEXT: store i32 [[SUB3]], i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK8-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK8-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK8-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] +// CHECK8-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK8: omp.precond.then: +// CHECK8-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 +// CHECK8-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK8-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_UB]], align 4 +// CHECK8-NEXT: [[TMP5:%.*]] = load i64, i64* [[DOTPREVIOUS_LB__ADDR]], align 8 +// CHECK8-NEXT: [[CONV4:%.*]] = trunc i64 [[TMP5]] to i32 +// CHECK8-NEXT: [[TMP6:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK8-NEXT: [[CONV5:%.*]] = trunc i64 [[TMP6]] to i32 +// CHECK8-NEXT: store i32 [[CONV4]], i32* [[DOTOMP_LB]], align 4 +// CHECK8-NEXT: store i32 [[CONV5]], i32* [[DOTOMP_UB]], align 4 +// CHECK8-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK8-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK8-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK8-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4 +// CHECK8-NEXT: call void 
@__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP8]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 32) +// CHECK8-NEXT: br label [[OMP_DISPATCH_COND:%.*]] +// CHECK8: omp.dispatch.cond: +// CHECK8-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK8-NEXT: [[CONV7:%.*]] = sext i32 [[TMP9]] to i64 +// CHECK8-NEXT: [[TMP10:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK8-NEXT: [[CMP8:%.*]] = icmp ugt i64 [[CONV7]], [[TMP10]] +// CHECK8-NEXT: br i1 [[CMP8]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK8: cond.true: +// CHECK8-NEXT: [[TMP11:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK8-NEXT: br label [[COND_END:%.*]] +// CHECK8: cond.false: +// CHECK8-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK8-NEXT: [[CONV9:%.*]] = sext i32 [[TMP12]] to i64 +// CHECK8-NEXT: br label [[COND_END]] +// CHECK8: cond.end: +// CHECK8-NEXT: [[COND:%.*]] = phi i64 [ [[TMP11]], [[COND_TRUE]] ], [ [[CONV9]], [[COND_FALSE]] ] +// CHECK8-NEXT: [[CONV10:%.*]] = trunc i64 [[COND]] to i32 +// CHECK8-NEXT: store i32 [[CONV10]], i32* [[DOTOMP_UB]], align 4 +// CHECK8-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK8-NEXT: store i32 [[TMP13]], i32* [[DOTOMP_IV]], align 4 +// CHECK8-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK8-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK8-NEXT: [[CMP11:%.*]] = icmp sle i32 [[TMP14]], [[TMP15]] +// CHECK8-NEXT: br i1 [[CMP11]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]] +// CHECK8: omp.dispatch.body: +// CHECK8-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK8: omp.inner.for.cond: +// CHECK8-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK8-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK8-NEXT: [[CMP12:%.*]] = icmp sle i32 [[TMP16]], [[TMP17]] +// CHECK8-NEXT: br i1 [[CMP12]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK8: omp.inner.for.body: +// CHECK8-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK8-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP18]], 1 +// CHECK8-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK8-NEXT: store i32 [[ADD]], i32* [[I6]], align 4 +// CHECK8-NEXT: [[TMP19:%.*]] = load i32, i32* [[I6]], align 4 +// CHECK8-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP19]] to i64 +// CHECK8-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1000 x i32], [1000 x i32]* [[TMP0]], i64 0, i64 [[IDXPROM]] +// CHECK8-NEXT: store i32 1, i32* [[ARRAYIDX]], align 4 +// CHECK8-NEXT: [[TMP20:%.*]] = load i32, i32* [[I6]], align 4 +// CHECK8-NEXT: store i32 [[TMP20]], i32* [[CONV1]], align 8 +// CHECK8-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK8: omp.body.continue: +// CHECK8-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK8: omp.inner.for.inc: +// CHECK8-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK8-NEXT: [[ADD13:%.*]] = add nsw i32 [[TMP21]], 1 +// CHECK8-NEXT: store i32 [[ADD13]], i32* [[DOTOMP_IV]], align 4 +// CHECK8-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK8: omp.inner.for.end: +// CHECK8-NEXT: br label [[OMP_DISPATCH_INC:%.*]] +// CHECK8: omp.dispatch.inc: +// CHECK8-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK8-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK8-NEXT: [[ADD14:%.*]] = add nsw i32 [[TMP22]], [[TMP23]] +// CHECK8-NEXT: store i32 
[[ADD14]], i32* [[DOTOMP_LB]], align 4 +// CHECK8-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK8-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK8-NEXT: [[ADD15:%.*]] = add nsw i32 [[TMP24]], [[TMP25]] +// CHECK8-NEXT: store i32 [[ADD15]], i32* [[DOTOMP_UB]], align 4 +// CHECK8-NEXT: br label [[OMP_DISPATCH_COND]] +// CHECK8: omp.dispatch.end: +// CHECK8-NEXT: [[TMP26:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK8-NEXT: [[TMP27:%.*]] = load i32, i32* [[TMP26]], align 4 +// CHECK8-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP27]]) +// CHECK8-NEXT: [[TMP28:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK8-NEXT: [[TMP29:%.*]] = icmp ne i32 [[TMP28]], 0 +// CHECK8-NEXT: br i1 [[TMP29]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]] +// CHECK8: .omp.lastprivate.then: +// CHECK8-NEXT: [[TMP30:%.*]] = load i32, i32* [[CONV1]], align 8 +// CHECK8-NEXT: store i32 [[TMP30]], i32* [[CONV1]], align 8 +// CHECK8-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]] +// CHECK8: .omp.lastprivate.done: +// CHECK8-NEXT: br label [[OMP_PRECOND_END]] +// CHECK8: omp.precond.end: +// CHECK8-NEXT: ret void +// +// +// CHECK8-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l49 +// CHECK8-SAME: (i64 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { +// CHECK8-NEXT: entry: +// CHECK8-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 +// CHECK8-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 8 +// CHECK8-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8 +// CHECK8-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK8-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 +// CHECK8-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 8 +// CHECK8-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* +// CHECK8-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 8 +// CHECK8-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK8-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) +// CHECK8-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK8: .execute: +// CHECK8-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) +// CHECK8-NEXT: [[TMP2:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK8-NEXT: [[CONV1:%.*]] = bitcast i64* [[N_CASTED]] to i32* +// CHECK8-NEXT: store i32 [[TMP2]], i32* [[CONV1]], align 4 +// CHECK8-NEXT: [[TMP3:%.*]] = load i64, i64* [[N_CASTED]], align 8 +// CHECK8-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK8-NEXT: call void @__omp_outlined__2(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i64 [[TMP3]], [1000 x i16]* [[TMP0]]) #[[ATTR2]] +// CHECK8-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK8: .omp.deinit: +// CHECK8-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) +// CHECK8-NEXT: br label [[DOTEXIT:%.*]] +// CHECK8: .exit: +// CHECK8-NEXT: ret void +// +// +// CHECK8-LABEL: define {{[^@]+}}@__omp_outlined__2 +// CHECK8-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { +// CHECK8-NEXT: entry: +// CHECK8-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK8-NEXT: [[DOTBOUND_TID__ADDR:%.*]] 
= alloca i32*, align 8
+// CHECK8-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8
+// CHECK8-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 8
+// CHECK8-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK8-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK8-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK8-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
+// CHECK8-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK8-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK8-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK8-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK8-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK8-NEXT: [[I3:%.*]] = alloca i32, align 4
+// CHECK8-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8
+// CHECK8-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x i8*], align 8
+// CHECK8-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK8-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
+// CHECK8-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8
+// CHECK8-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 8
+// CHECK8-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32*
+// CHECK8-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 8
+// CHECK8-NEXT: [[TMP1:%.*]] = load i32, i32* [[CONV]], align 8
+// CHECK8-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK8-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK8-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0
+// CHECK8-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK8-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1
+// CHECK8-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK8-NEXT: store i32 0, i32* [[I]], align 4
+// CHECK8-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK8-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]]
+// CHECK8-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]]
+// CHECK8: omp.precond.then:
+// CHECK8-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK8-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK8-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK8-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK8-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK8-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK8-NEXT: [[TMP5:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK8-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4
+// CHECK8-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP6]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]])
+// CHECK8-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK8-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK8-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP7]], [[TMP8]]
+// CHECK8-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK8: cond.true:
+// CHECK8-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK8-NEXT: br label [[COND_END:%.*]]
+// CHECK8: cond.false:
+// CHECK8-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK8-NEXT: br label [[COND_END]]
+// CHECK8: cond.end:
+// CHECK8-NEXT: [[COND:%.*]] = phi i32 [ [[TMP9]], [[COND_TRUE]] ], [ [[TMP10]], [[COND_FALSE]] ]
+// CHECK8-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK8-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK8-NEXT: store i32 [[TMP11]], i32* [[DOTOMP_IV]], align 4
+// CHECK8-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK8: omp.inner.for.cond:
+// CHECK8-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK8-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK8-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP13]], 1
+// CHECK8-NEXT: [[CMP5:%.*]] = icmp slt i32 [[TMP12]], [[ADD]]
+// CHECK8-NEXT: br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK8: omp.inner.for.body:
+// CHECK8-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK8-NEXT: [[TMP15:%.*]] = zext i32 [[TMP14]] to i64
+// CHECK8-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK8-NEXT: [[TMP17:%.*]] = zext i32 [[TMP16]] to i64
+// CHECK8-NEXT: [[TMP18:%.*]] = load i32, i32* [[CONV]], align 8
+// CHECK8-NEXT: [[CONV6:%.*]] = bitcast i64* [[N_CASTED]] to i32*
+// CHECK8-NEXT: store i32 [[TMP18]], i32* [[CONV6]], align 4
+// CHECK8-NEXT: [[TMP19:%.*]] = load i64, i64* [[N_CASTED]], align 8
+// CHECK8-NEXT: [[TMP20:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0
+// CHECK8-NEXT: [[TMP21:%.*]] = inttoptr i64 [[TMP15]] to i8*
+// CHECK8-NEXT: store i8* [[TMP21]], i8** [[TMP20]], align 8
+// CHECK8-NEXT: [[TMP22:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1
+// CHECK8-NEXT: [[TMP23:%.*]] = inttoptr i64 [[TMP17]] to i8*
+// CHECK8-NEXT: store i8* [[TMP23]], i8** [[TMP22]], align 8
+// CHECK8-NEXT: [[TMP24:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 2
+// CHECK8-NEXT: [[TMP25:%.*]] = inttoptr i64 [[TMP19]] to i8*
+// CHECK8-NEXT: store i8* [[TMP25]], i8** [[TMP24]], align 8
+// CHECK8-NEXT: [[TMP26:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 3
+// CHECK8-NEXT: [[TMP27:%.*]] = bitcast [1000 x i16]* [[TMP0]] to i8*
+// CHECK8-NEXT: store i8* [[TMP27]], i8** [[TMP26]], align 8
+// CHECK8-NEXT: [[TMP28:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK8-NEXT: [[TMP29:%.*]] = load i32, i32* [[TMP28]], align 4
+// CHECK8-NEXT: [[TMP30:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK8-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP29]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i64, i64, i64, [1000 x i16]*)* @__omp_outlined__3 to i8*), i8* null, i8** [[TMP30]], i64 4)
+// CHECK8-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK8: omp.inner.for.inc:
+// CHECK8-NEXT: [[TMP31:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK8-NEXT: [[TMP32:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK8-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP31]], [[TMP32]]
+// CHECK8-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_IV]], align 4
+// CHECK8-NEXT: [[TMP33:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK8-NEXT: [[TMP34:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK8-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP33]], [[TMP34]]
+// CHECK8-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK8-NEXT: [[TMP35:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK8-NEXT: [[TMP36:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK8-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP35]], [[TMP36]]
+// CHECK8-NEXT: store i32 [[ADD9]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK8-NEXT: [[TMP37:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK8-NEXT: [[TMP38:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK8-NEXT: [[CMP10:%.*]] = icmp sgt i32 [[TMP37]], [[TMP38]]
+// CHECK8-NEXT: br i1 [[CMP10]], label [[COND_TRUE11:%.*]], label [[COND_FALSE12:%.*]]
+// CHECK8: cond.true11:
+// CHECK8-NEXT: [[TMP39:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK8-NEXT: br label [[COND_END13:%.*]]
+// CHECK8: cond.false12:
+// CHECK8-NEXT: [[TMP40:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK8-NEXT: br label [[COND_END13]]
+// CHECK8: cond.end13:
+// CHECK8-NEXT: [[COND14:%.*]] = phi i32 [ [[TMP39]], [[COND_TRUE11]] ], [ [[TMP40]], [[COND_FALSE12]] ]
+// CHECK8-NEXT: store i32 [[COND14]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK8-NEXT: [[TMP41:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK8-NEXT: store i32 [[TMP41]], i32* [[DOTOMP_IV]], align 4
+// CHECK8-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK8: omp.inner.for.end:
+// CHECK8-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK8: omp.loop.exit:
+// CHECK8-NEXT: [[TMP42:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK8-NEXT: [[TMP43:%.*]] = load i32, i32* [[TMP42]], align 4
+// CHECK8-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP43]])
+// CHECK8-NEXT: br label [[OMP_PRECOND_END]]
+// CHECK8: omp.precond.end:
+// CHECK8-NEXT: ret void
+//
+//
+// CHECK8-LABEL: define {{[^@]+}}@__omp_outlined__3
+// CHECK8-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], i64 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] {
+// CHECK8-NEXT: entry:
+// CHECK8-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK8-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK8-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8
+// CHECK8-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8
+// CHECK8-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8
+// CHECK8-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 8
+// CHECK8-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK8-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK8-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK8-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
+// CHECK8-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK8-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK8-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK8-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK8-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK8-NEXT: [[I5:%.*]] = alloca i32, align 4
+// CHECK8-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK8-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
+// CHECK8-NEXT: store i64 [[DOTPREVIOUS_LB_]], i64* [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK8-NEXT: store i64 [[DOTPREVIOUS_UB_]], i64* [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK8-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8
+// CHECK8-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 8
+// CHECK8-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32*
+// CHECK8-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 8
+// CHECK8-NEXT: [[TMP1:%.*]] = load i32, i32* [[CONV]], align 8
+// CHECK8-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK8-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK8-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0
+// CHECK8-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK8-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1
+// CHECK8-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK8-NEXT: store i32 0, i32* [[I]], align 4
+// CHECK8-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK8-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]]
+// CHECK8-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]]
+// CHECK8: omp.precond.then:
+// CHECK8-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK8-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK8-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_UB]], align 4
+// CHECK8-NEXT: [[TMP5:%.*]] = load i64, i64* [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK8-NEXT: [[CONV3:%.*]] = trunc i64 [[TMP5]] to i32
+// CHECK8-NEXT: [[TMP6:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK8-NEXT: [[CONV4:%.*]] = trunc i64 [[TMP6]] to i32
+// CHECK8-NEXT: store i32 [[CONV3]], i32* [[DOTOMP_LB]], align 4
+// CHECK8-NEXT: store i32 [[CONV4]], i32* [[DOTOMP_UB]], align 4
+// CHECK8-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK8-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK8-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK8-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4
+// CHECK8-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP8]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK8-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK8-NEXT: store i32 [[TMP9]], i32* [[DOTOMP_IV]], align 4
+// CHECK8-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK8: omp.inner.for.cond:
+// CHECK8-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK8-NEXT: [[CONV6:%.*]] = sext i32 [[TMP10]] to i64
+// CHECK8-NEXT: [[TMP11:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK8-NEXT: [[CMP7:%.*]] = icmp ule i64 [[CONV6]], [[TMP11]]
+// CHECK8-NEXT: br i1 [[CMP7]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK8: omp.inner.for.body:
+// CHECK8-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK8-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP12]], 1
+// CHECK8-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK8-NEXT: store i32 [[ADD]], i32* [[I5]], align 4
+// CHECK8-NEXT: [[TMP13:%.*]] = load i32, i32* [[I5]], align 4
+// CHECK8-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP13]] to i64
+// CHECK8-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1000 x i16], [1000 x i16]* [[TMP0]], i64 0, i64 [[IDXPROM]]
+// CHECK8-NEXT: [[TMP14:%.*]] = load i16, i16* [[ARRAYIDX]], align 2
+// CHECK8-NEXT: [[CONV8:%.*]] = sext i16 [[TMP14]] to i32
+// CHECK8-NEXT: [[ADD9:%.*]] = add nsw i32 [[CONV8]], 1
+// CHECK8-NEXT: [[CONV10:%.*]] = trunc i32 [[ADD9]] to i16
+// CHECK8-NEXT: store i16 [[CONV10]], i16* [[ARRAYIDX]], align 2
+// CHECK8-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK8: omp.body.continue:
+// CHECK8-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK8: omp.inner.for.inc:
+// CHECK8-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK8-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK8-NEXT: [[ADD11:%.*]] = add nsw i32 [[TMP15]], [[TMP16]]
+// CHECK8-NEXT: store i32 [[ADD11]], i32* [[DOTOMP_IV]], align 4
+// CHECK8-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK8: omp.inner.for.end:
+// CHECK8-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK8: omp.loop.exit:
+// CHECK8-NEXT: [[TMP17:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK8-NEXT: [[TMP18:%.*]] = load i32, i32* [[TMP17]], align 4
+// CHECK8-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP18]])
+// CHECK8-NEXT: br label [[OMP_PRECOND_END]]
+// CHECK8: omp.precond.end:
+// CHECK8-NEXT: ret void
+//
+//
+// CHECK8-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l54
+// CHECK8-SAME: ([10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] {
+// CHECK8-NEXT: entry:
+// CHECK8-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 8
+// CHECK8-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK8-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK8-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK8-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 8
+// CHECK8-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 8
+// CHECK8-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK8-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0)
+// CHECK8-NEXT: br label [[DOTEXECUTE:%.*]]
+// CHECK8: .execute:
+// CHECK8-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]])
+// CHECK8-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK8-NEXT: call void @__omp_outlined__4(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x i32]* [[TMP0]]) #[[ATTR2]]
+// CHECK8-NEXT: br label [[DOTOMP_DEINIT:%.*]]
+// CHECK8: .omp.deinit:
+// CHECK8-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0)
+// CHECK8-NEXT: br label [[DOTEXIT:%.*]]
+// CHECK8: .exit:
+// CHECK8-NEXT: ret void
+//
+//
+// CHECK8-LABEL: define {{[^@]+}}@__omp_outlined__4
+// CHECK8-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] {
+// CHECK8-NEXT: entry:
+// CHECK8-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK8-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK8-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 8
+// CHECK8-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK8-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK8-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK8-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK8-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK8-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK8-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK8-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [3 x i8*], align 8
+// CHECK8-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK8-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
+// CHECK8-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 8
+// CHECK8-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 8
+// CHECK8-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK8-NEXT: store i32 9, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK8-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK8-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK8-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK8-NEXT: [[TMP1:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK8-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4
+// CHECK8-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]])
+// CHECK8-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK8-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP3]], 9
+// CHECK8-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK8: cond.true:
+// CHECK8-NEXT: br label [[COND_END:%.*]]
+// CHECK8: cond.false:
+// CHECK8-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK8-NEXT: br label [[COND_END]]
+// CHECK8: cond.end:
+// CHECK8-NEXT: [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP4]], [[COND_FALSE]] ]
+// CHECK8-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK8-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK8-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4
+// CHECK8-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK8: omp.inner.for.cond:
+// CHECK8-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK8-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP6]], 10
+// CHECK8-NEXT: br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK8: omp.inner.for.body:
+// CHECK8-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK8-NEXT: [[TMP8:%.*]] = zext i32 [[TMP7]] to i64
+// CHECK8-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK8-NEXT: [[TMP10:%.*]] = zext i32 [[TMP9]] to i64
+// CHECK8-NEXT: [[TMP11:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0
+// CHECK8-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP8]] to i8*
+// CHECK8-NEXT: store i8* [[TMP12]], i8** [[TMP11]], align 8
+// CHECK8-NEXT: [[TMP13:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1
+// CHECK8-NEXT: [[TMP14:%.*]] = inttoptr i64 [[TMP10]] to i8*
+// CHECK8-NEXT: store i8* [[TMP14]], i8** [[TMP13]], align 8
+// CHECK8-NEXT: [[TMP15:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 2
+// CHECK8-NEXT: [[TMP16:%.*]] = bitcast [10 x i32]* [[TMP0]] to i8*
+// CHECK8-NEXT: store i8* [[TMP16]], i8** [[TMP15]], align 8
+// CHECK8-NEXT: [[TMP17:%.*]] = bitcast [3 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK8-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i64, i64, [10 x i32]*)* @__omp_outlined__5 to i8*), i8* null, i8** [[TMP17]], i64 3)
+// CHECK8-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK8: omp.inner.for.inc:
+// CHECK8-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK8-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK8-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP18]], [[TMP19]]
+// CHECK8-NEXT: store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4
+// CHECK8-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK8-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK8-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP20]], [[TMP21]]
+// CHECK8-NEXT: store i32 [[ADD2]], i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK8-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK8-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK8-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP22]], [[TMP23]]
+// CHECK8-NEXT: store i32 [[ADD3]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK8-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK8-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP24]], 9
+// CHECK8-NEXT: br i1 [[CMP4]], label [[COND_TRUE5:%.*]], label [[COND_FALSE6:%.*]]
+// CHECK8: cond.true5:
+// CHECK8-NEXT: br label [[COND_END7:%.*]]
+// CHECK8: cond.false6:
+// CHECK8-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK8-NEXT: br label [[COND_END7]]
+// CHECK8: cond.end7:
+// CHECK8-NEXT: [[COND8:%.*]] = phi i32 [ 9, [[COND_TRUE5]] ], [ [[TMP25]], [[COND_FALSE6]] ]
+// CHECK8-NEXT: store i32 [[COND8]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK8-NEXT: [[TMP26:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK8-NEXT: store i32 [[TMP26]], i32* [[DOTOMP_IV]], align 4
+// CHECK8-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK8: omp.inner.for.end:
+// CHECK8-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK8: omp.loop.exit:
+// CHECK8-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]])
+// CHECK8-NEXT: ret void
+//
+//
+// CHECK8-LABEL: define {{[^@]+}}@__omp_outlined__5
+// CHECK8-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] {
+// CHECK8-NEXT: entry:
+// CHECK8-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK8-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK8-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8
+// CHECK8-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8
+// CHECK8-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 8
+// CHECK8-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK8-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK8-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK8-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK8-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK8-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK8-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK8-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK8-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
+// CHECK8-NEXT: store i64 [[DOTPREVIOUS_LB_]], i64* [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK8-NEXT: store i64 [[DOTPREVIOUS_UB_]], i64* [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK8-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 8
+// CHECK8-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 8
+// CHECK8-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK8-NEXT: store i32 9, i32* [[DOTOMP_UB]], align 4
+// CHECK8-NEXT: [[TMP1:%.*]] = load i64, i64* [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK8-NEXT: [[CONV:%.*]] = trunc i64 [[TMP1]] to i32
+// CHECK8-NEXT: [[TMP2:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK8-NEXT: [[CONV1:%.*]] = trunc i64 [[TMP2]] to i32
+// CHECK8-NEXT: store i32 [[CONV]], i32* [[DOTOMP_LB]], align 4
+// CHECK8-NEXT: store i32 [[CONV1]], i32* [[DOTOMP_UB]], align 4
+// CHECK8-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK8-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK8-NEXT: [[TMP3:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK8-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4
+// CHECK8-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP4]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK8-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK8-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4
+// CHECK8-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK8: omp.inner.for.cond:
+// CHECK8-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK8-NEXT: [[CONV2:%.*]] = sext i32 [[TMP6]] to i64
+// CHECK8-NEXT: [[TMP7:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK8-NEXT: [[CMP:%.*]] = icmp ule i64 [[CONV2]], [[TMP7]]
+// CHECK8-NEXT: br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK8: omp.inner.for.body:
+// CHECK8-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK8-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP8]], 1
+// CHECK8-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK8-NEXT: store i32 [[ADD]], i32* [[I]], align 4
+// CHECK8-NEXT: [[TMP9:%.*]] = load i32, i32* [[I]], align 4
+// CHECK8-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP9]] to i64
+// CHECK8-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP0]], i64 0, i64 [[IDXPROM]]
+// CHECK8-NEXT: [[TMP10:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+// CHECK8-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP10]], 1
+// CHECK8-NEXT: store i32 [[ADD3]], i32* [[ARRAYIDX]], align 4
+// CHECK8-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK8: omp.body.continue:
+// CHECK8-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK8: omp.inner.for.inc:
+// CHECK8-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK8-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK8-NEXT: [[ADD4:%.*]] = add nsw i32 [[TMP11]], [[TMP12]]
+// CHECK8-NEXT: store i32 [[ADD4]], i32* [[DOTOMP_IV]], align 4
+// CHECK8-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK8: omp.inner.for.end:
+// CHECK8-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK8: omp.loop.exit:
+// CHECK8-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP4]])
+// CHECK8-NEXT: ret void
+//
+//
+// CHECK8-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l59
+// CHECK8-SAME: ([10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i64 [[F:%.*]]) #[[ATTR0]] {
+// CHECK8-NEXT: entry:
+// CHECK8-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 8
+// CHECK8-NEXT: [[F_ADDR:%.*]] = alloca i64, align 8
+// CHECK8-NEXT: [[F_CASTED:%.*]] = alloca i64, align 8
+// CHECK8-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK8-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK8-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK8-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 8
+// CHECK8-NEXT: store i64 [[F]], i64* [[F_ADDR]], align 8
+// CHECK8-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 8
+// CHECK8-NEXT: [[CONV:%.*]] = bitcast i64* [[F_ADDR]] to i32*
+// CHECK8-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK8-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0)
+// CHECK8-NEXT: br label [[DOTEXECUTE:%.*]]
+// CHECK8: .execute:
+// CHECK8-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]])
+// CHECK8-NEXT: [[TMP2:%.*]] = load i32, i32* [[CONV]], align 8
+// CHECK8-NEXT: [[CONV1:%.*]] = bitcast i64* [[F_CASTED]] to i32*
+// CHECK8-NEXT: store i32 [[TMP2]], i32* [[CONV1]], align 4
+// CHECK8-NEXT: [[TMP3:%.*]] = load i64, i64* [[F_CASTED]], align 8
+// CHECK8-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK8-NEXT: call void @__omp_outlined__6(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x [10 x i32]]* [[TMP0]], i64 [[TMP3]]) #[[ATTR2]]
+// CHECK8-NEXT: br label [[DOTOMP_DEINIT:%.*]]
+// CHECK8: .omp.deinit:
+// CHECK8-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0)
+// CHECK8-NEXT: br label [[DOTEXIT:%.*]]
+// CHECK8: .exit:
+// CHECK8-NEXT: ret void
+//
+//
+// CHECK8-LABEL: define {{[^@]+}}@__omp_outlined__6
+// CHECK8-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i64 [[F:%.*]]) #[[ATTR0]] {
+// CHECK8-NEXT: entry:
+// CHECK8-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK8-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK8-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 8
+// CHECK8-NEXT: [[F_ADDR:%.*]] = alloca i64, align 8
+// CHECK8-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK8-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK8-NEXT: [[_TMP1:%.*]] = alloca i32, align 4
+// CHECK8-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK8-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK8-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK8-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK8-NEXT: [[K:%.*]] = alloca i32, align 4
+// CHECK8-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK8-NEXT: [[J:%.*]] = alloca i32, align 4
+// CHECK8-NEXT: [[F_CASTED:%.*]] = alloca i64, align 8
+// CHECK8-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x i8*], align 8
+// CHECK8-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK8-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
+// CHECK8-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 8
+// CHECK8-NEXT: store i64 [[F]], i64* [[F_ADDR]], align 8
+// CHECK8-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 8
+// CHECK8-NEXT: [[CONV:%.*]] = bitcast i64* [[F_ADDR]] to i32*
+// CHECK8-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK8-NEXT: store i32 99, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK8-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK8-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK8-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK8-NEXT: [[TMP1:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK8-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4
+// CHECK8-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]])
+// CHECK8-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK8-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP3]], 99
+// CHECK8-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK8: cond.true:
+// CHECK8-NEXT: br label [[COND_END:%.*]]
+// CHECK8: cond.false:
+// CHECK8-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK8-NEXT: br label [[COND_END]]
+// CHECK8: cond.end:
+// CHECK8-NEXT: [[COND:%.*]] = phi i32 [ 99, [[COND_TRUE]] ], [ [[TMP4]], [[COND_FALSE]] ]
+// CHECK8-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK8-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK8-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4
+// CHECK8-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK8: omp.inner.for.cond:
+// CHECK8-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK8-NEXT: [[CMP2:%.*]] = icmp slt i32 [[TMP6]], 100
+// CHECK8-NEXT: br i1 [[CMP2]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK8: omp.inner.for.body:
+// CHECK8-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK8-NEXT: [[TMP8:%.*]] = zext i32 [[TMP7]] to i64
+// CHECK8-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK8-NEXT: [[TMP10:%.*]] = zext i32 [[TMP9]] to i64
+// CHECK8-NEXT: [[TMP11:%.*]] = load i32, i32* [[CONV]], align 8
+// CHECK8-NEXT: [[CONV3:%.*]] = bitcast i64* [[F_CASTED]] to i32*
+// CHECK8-NEXT: store i32 [[TMP11]], i32* [[CONV3]], align 4
+// CHECK8-NEXT: [[TMP12:%.*]] = load i64, i64* [[F_CASTED]], align 8
+// CHECK8-NEXT: [[TMP13:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0
+// CHECK8-NEXT: [[TMP14:%.*]] = inttoptr i64 [[TMP8]] to i8*
+// CHECK8-NEXT: store i8* [[TMP14]], i8** [[TMP13]], align 8
+// CHECK8-NEXT: [[TMP15:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1
+// CHECK8-NEXT: [[TMP16:%.*]] = inttoptr i64 [[TMP10]] to i8*
+// CHECK8-NEXT: store i8* [[TMP16]], i8** [[TMP15]], align 8
+// CHECK8-NEXT: [[TMP17:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 2
+// CHECK8-NEXT: [[TMP18:%.*]] = bitcast [10 x [10 x i32]]* [[TMP0]] to i8*
+// CHECK8-NEXT: store i8* [[TMP18]], i8** [[TMP17]], align 8
+// CHECK8-NEXT: [[TMP19:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 3
+// CHECK8-NEXT: [[TMP20:%.*]] = inttoptr i64 [[TMP12]] to i8*
+// CHECK8-NEXT: store i8* [[TMP20]], i8** [[TMP19]], align 8
+// CHECK8-NEXT: [[TMP21:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK8-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i64, i64, [10 x [10 x i32]]*, i64)* @__omp_outlined__7 to i8*), i8* null, i8** [[TMP21]], i64 4)
+// CHECK8-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK8: omp.inner.for.inc:
+// CHECK8-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK8-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK8-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP22]], [[TMP23]]
+// CHECK8-NEXT: store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4
+// CHECK8-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK8-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK8-NEXT: [[ADD4:%.*]] = add nsw i32 [[TMP24]], [[TMP25]]
+// CHECK8-NEXT: store i32 [[ADD4]], i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK8-NEXT: [[TMP26:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK8-NEXT: [[TMP27:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK8-NEXT: [[ADD5:%.*]] = add nsw i32 [[TMP26]], [[TMP27]]
+// CHECK8-NEXT: store i32 [[ADD5]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK8-NEXT: [[TMP28:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK8-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[TMP28]], 99
+// CHECK8-NEXT: br i1 [[CMP6]], label [[COND_TRUE7:%.*]], label [[COND_FALSE8:%.*]]
+// CHECK8: cond.true7:
+// CHECK8-NEXT: br label [[COND_END9:%.*]]
+// CHECK8: cond.false8:
+// CHECK8-NEXT: [[TMP29:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK8-NEXT: br label [[COND_END9]]
+// CHECK8: cond.end9:
+// CHECK8-NEXT: [[COND10:%.*]] = phi i32 [ 99, [[COND_TRUE7]] ], [ [[TMP29]], [[COND_FALSE8]] ]
+// CHECK8-NEXT: store i32 [[COND10]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK8-NEXT: [[TMP30:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK8-NEXT: store i32 [[TMP30]], i32* [[DOTOMP_IV]], align 4
+// CHECK8-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK8: omp.inner.for.end:
+// CHECK8-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK8: omp.loop.exit:
+// CHECK8-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]])
+// CHECK8-NEXT: ret void
+//
+//
+// CHECK8-LABEL: define {{[^@]+}}@__omp_outlined__7
+// CHECK8-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i64 [[F:%.*]]) #[[ATTR0]] {
+// CHECK8-NEXT: entry:
+// CHECK8-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK8-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK8-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8
+// CHECK8-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8
+// CHECK8-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 8
+// CHECK8-NEXT: [[F_ADDR:%.*]] = alloca i64, align 8
+// CHECK8-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK8-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK8-NEXT: [[_TMP1:%.*]] = alloca i32, align 4
+// CHECK8-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK8-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK8-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK8-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK8-NEXT: [[K:%.*]] = alloca i32, align 4
+// CHECK8-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK8-NEXT: [[J:%.*]] = alloca i32, align 4
+// CHECK8-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK8-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
+// CHECK8-NEXT: store i64 [[DOTPREVIOUS_LB_]], i64* [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK8-NEXT: store i64 [[DOTPREVIOUS_UB_]], i64* [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK8-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 8
+// CHECK8-NEXT: store i64 [[F]], i64* [[F_ADDR]], align 8
+// CHECK8-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 8
+// CHECK8-NEXT: [[CONV:%.*]] = bitcast i64* [[F_ADDR]] to i32*
+// CHECK8-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK8-NEXT: store i32 99, i32* [[DOTOMP_UB]], align 4
+// CHECK8-NEXT: [[TMP1:%.*]] = load i64, i64* [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK8-NEXT: [[CONV2:%.*]] = trunc i64 [[TMP1]] to i32
+// CHECK8-NEXT: [[TMP2:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK8-NEXT: [[CONV3:%.*]] = trunc i64 [[TMP2]] to i32
+// CHECK8-NEXT: store i32 [[CONV2]], i32* [[DOTOMP_LB]], align 4
+// CHECK8-NEXT: store i32 [[CONV3]], i32* [[DOTOMP_UB]], align 4
+// CHECK8-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK8-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK8-NEXT: [[TMP3:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK8-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4
+// CHECK8-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP4]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK8-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK8-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4
+// CHECK8-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK8: omp.inner.for.cond:
+// CHECK8-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK8-NEXT: [[CONV4:%.*]] = sext i32 [[TMP6]] to i64
+// CHECK8-NEXT: [[TMP7:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK8-NEXT: [[CMP:%.*]] = icmp ule i64 [[CONV4]], [[TMP7]]
+// CHECK8-NEXT: br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK8: omp.inner.for.body:
+// CHECK8-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK8-NEXT: [[DIV:%.*]] = sdiv i32 [[TMP8]], 10
+// CHECK8-NEXT: [[MUL:%.*]] = mul nsw i32 [[DIV]], 1
+// CHECK8-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK8-NEXT: store i32 [[ADD]], i32* [[I]], align 4
+// CHECK8-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK8-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK8-NEXT: [[DIV5:%.*]] = sdiv i32 [[TMP10]], 10
+// CHECK8-NEXT: [[MUL6:%.*]] = mul nsw i32 [[DIV5]], 10
+// CHECK8-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP9]], [[MUL6]]
+// CHECK8-NEXT: [[MUL7:%.*]] = mul nsw i32 [[SUB]], 1
+// CHECK8-NEXT: [[ADD8:%.*]] = add nsw i32 0, [[MUL7]]
+// CHECK8-NEXT: store i32 [[ADD8]], i32* [[J]], align 4
+// CHECK8-NEXT: store i32 10, i32* [[K]], align 4
+// CHECK8-NEXT: [[TMP11:%.*]] = load i32, i32* [[I]], align 4
+// CHECK8-NEXT: [[TMP12:%.*]] = load i32, i32* [[J]], align 4
+// CHECK8-NEXT: [[TMP13:%.*]] = load i32, i32* [[CONV]], align 8
+// CHECK8-NEXT: [[MUL9:%.*]] = mul nsw i32 [[TMP12]], [[TMP13]]
+// CHECK8-NEXT: [[ADD10:%.*]] = add nsw i32 [[TMP11]], [[MUL9]]
+// CHECK8-NEXT: [[TMP14:%.*]] = load i32, i32* [[K]], align 4
+// CHECK8-NEXT: [[ADD11:%.*]] = add nsw i32 [[ADD10]], [[TMP14]]
+// CHECK8-NEXT: [[TMP15:%.*]] = load i32, i32* [[I]], align 4
+// CHECK8-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP15]] to i64
+// CHECK8-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* [[TMP0]], i64 0, i64 [[IDXPROM]]
+// CHECK8-NEXT: [[TMP16:%.*]] = load i32, i32* [[J]], align 4
+// CHECK8-NEXT: [[IDXPROM12:%.*]] = sext i32 [[TMP16]] to i64
+// CHECK8-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[ARRAYIDX]], i64 0, i64 [[IDXPROM12]]
+// CHECK8-NEXT: store i32 [[ADD11]], i32* [[ARRAYIDX13]], align 4
+// CHECK8-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK8: omp.body.continue:
+// CHECK8-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK8: omp.inner.for.inc:
+// CHECK8-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK8-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK8-NEXT: [[ADD14:%.*]] = add nsw i32 [[TMP17]], [[TMP18]]
+// CHECK8-NEXT: store i32 [[ADD14]], i32* [[DOTOMP_IV]], align 4
+// CHECK8-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK8: omp.inner.for.end:
+// CHECK8-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK8: omp.loop.exit:
+// CHECK8-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP4]])
+// CHECK8-NEXT: ret void
+//
+//
+// CHECK8-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l67
+// CHECK8-SAME: (i64 [[N:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]]) #[[ATTR0]] {
+// CHECK8-NEXT: entry:
+// CHECK8-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8
+// CHECK8-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 8
+// CHECK8-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8
+// CHECK8-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK8-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK8-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK8-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8
+// CHECK8-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 8
+// CHECK8-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32*
+// CHECK8-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 8
+// CHECK8-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK8-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0)
+// CHECK8-NEXT: br label [[DOTEXECUTE:%.*]]
+// CHECK8: .execute:
+// CHECK8-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]])
+// CHECK8-NEXT: [[TMP2:%.*]] = load i32, i32* [[CONV]], align 8
+// CHECK8-NEXT: [[CONV1:%.*]] = bitcast i64* [[N_CASTED]] to i32*
+// CHECK8-NEXT: store i32 [[TMP2]], i32* [[CONV1]], align 4
+// CHECK8-NEXT: [[TMP3:%.*]] = load i64, i64* [[N_CASTED]], align 8
+// CHECK8-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK8-NEXT: call void @__omp_outlined__8(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i64 [[TMP3]], [10 x [10 x i32]]* [[TMP0]]) #[[ATTR2]]
+// CHECK8-NEXT: br label [[DOTOMP_DEINIT:%.*]]
+// CHECK8: .omp.deinit:
+// CHECK8-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0)
+// CHECK8-NEXT: br label [[DOTEXIT:%.*]]
+// CHECK8: .exit:
+// CHECK8-NEXT: ret void
+//
+//
+// CHECK8-LABEL: define {{[^@]+}}@__omp_outlined__8
+// CHECK8-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[N:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]]) #[[ATTR0]] {
+// CHECK8-NEXT: entry:
+// CHECK8-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK8-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK8-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8
+// CHECK8-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 8
+// CHECK8-NEXT: [[DOTOMP_IV:%.*]] = alloca i64, align 8
+// CHECK8-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK8-NEXT: [[_TMP1:%.*]] = alloca i32, align 4
+// CHECK8-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK8-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4
+// CHECK8-NEXT: [[DOTCAPTURE_EXPR_3:%.*]] = alloca i64, align 8
+// CHECK8-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK8-NEXT: [[J:%.*]] = alloca i32, align 4
+// CHECK8-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i64, align 8
+// CHECK8-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i64, align 8
+// CHECK8-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i64, align 8
+// CHECK8-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK8-NEXT: [[I10:%.*]] = alloca i32, align 4
+// CHECK8-NEXT: [[J11:%.*]] = alloca i32, align 4
+// CHECK8-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8
+// CHECK8-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x i8*], align 8
+// CHECK8-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK8-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
+// CHECK8-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8
+// CHECK8-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 8
+// CHECK8-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32*
+// CHECK8-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 8
+// CHECK8-NEXT: [[TMP1:%.*]] = load i32, i32* [[CONV]], align 8
+// CHECK8-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK8-NEXT: [[TMP2:%.*]] = load i32, i32* [[CONV]], align 8
+// CHECK8-NEXT: store i32 [[TMP2]], i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK8-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK8-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP3]], 0
+// CHECK8-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK8-NEXT: [[CONV4:%.*]] = sext i32 [[DIV]] to i64
+// CHECK8-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK8-NEXT: [[SUB5:%.*]] = sub nsw i32 [[TMP4]], 0
+// CHECK8-NEXT: [[DIV6:%.*]] = sdiv i32 [[SUB5]], 1
+// CHECK8-NEXT: [[CONV7:%.*]] = sext i32 [[DIV6]] to i64
+// CHECK8-NEXT: [[MUL:%.*]] = mul nsw i64 [[CONV4]], [[CONV7]]
+// CHECK8-NEXT: [[SUB8:%.*]] = sub nsw i64 [[MUL]], 1
+// CHECK8-NEXT: store i64 [[SUB8]], i64* [[DOTCAPTURE_EXPR_3]], align 8
+// CHECK8-NEXT: store i32 0, i32* [[I]], align 4
+// CHECK8-NEXT: store i32 0, i32* [[J]], align 4
+// CHECK8-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK8-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP5]]
+// CHECK8-NEXT: br i1 [[CMP]], label [[LAND_LHS_TRUE:%.*]], label [[OMP_PRECOND_END:%.*]]
+// CHECK8: land.lhs.true:
+// CHECK8-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK8-NEXT: [[CMP9:%.*]] = icmp slt i32 0, [[TMP6]]
+// CHECK8-NEXT: br i1 [[CMP9]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END]]
+// CHECK8: omp.precond.then:
+// CHECK8-NEXT: store i64 0, i64* [[DOTOMP_COMB_LB]], align 8
+// CHECK8-NEXT: [[TMP7:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8
+// CHECK8-NEXT: store i64 [[TMP7]], i64* [[DOTOMP_COMB_UB]], align 8
+// CHECK8-NEXT: store i64 1, i64* [[DOTOMP_STRIDE]], align 8
+// CHECK8-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK8-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK8-NEXT: [[CONV12:%.*]] = zext i32 [[NVPTX_NUM_THREADS]] to i64
+// CHECK8-NEXT: [[TMP8:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK8-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP8]], align 4
+// CHECK8-NEXT: call void @__kmpc_for_static_init_8(%struct.ident_t* @[[GLOB1]], i32 [[TMP9]], i32 91, i32* [[DOTOMP_IS_LAST]], i64* [[DOTOMP_COMB_LB]], i64* [[DOTOMP_COMB_UB]], i64* [[DOTOMP_STRIDE]], i64 1, i64 [[CONV12]])
+// CHECK8-NEXT: [[TMP10:%.*]] = load i64, i64* [[DOTOMP_COMB_UB]], align 8
+// CHECK8-NEXT: [[TMP11:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8
+// CHECK8-NEXT: [[CMP13:%.*]] = icmp sgt i64 [[TMP10]], [[TMP11]]
+// CHECK8-NEXT: br i1 [[CMP13]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK8: cond.true:
+// CHECK8-NEXT: [[TMP12:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8
+// CHECK8-NEXT: br label [[COND_END:%.*]]
+// CHECK8: cond.false:
+// CHECK8-NEXT: [[TMP13:%.*]] = load i64, i64* [[DOTOMP_COMB_UB]], align 8
+// CHECK8-NEXT: br label [[COND_END]]
+// CHECK8: cond.end:
+// CHECK8-NEXT: [[COND:%.*]] = phi i64 [ [[TMP12]], [[COND_TRUE]] ], [ [[TMP13]], [[COND_FALSE]] ]
+// CHECK8-NEXT: store i64 [[COND]], i64* [[DOTOMP_COMB_UB]], align 8
+// CHECK8-NEXT: [[TMP14:%.*]] = load i64, i64* [[DOTOMP_COMB_LB]], align 8
+// CHECK8-NEXT: store i64 [[TMP14]], i64* [[DOTOMP_IV]], align 8
+// CHECK8-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK8: omp.inner.for.cond:
+// CHECK8-NEXT: [[TMP15:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8
+// CHECK8-NEXT: [[TMP16:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8
+// CHECK8-NEXT: [[ADD:%.*]] = add nsw i64 [[TMP16]], 1
+// CHECK8-NEXT: [[CMP14:%.*]] = icmp slt i64 [[TMP15]], [[ADD]]
+// CHECK8-NEXT: br i1 [[CMP14]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK8: omp.inner.for.body:
+// CHECK8-NEXT: [[TMP17:%.*]] = load i64, i64* [[DOTOMP_COMB_LB]], align 8
+// CHECK8-NEXT: [[TMP18:%.*]] = load i64, i64* [[DOTOMP_COMB_UB]], align 8
+// CHECK8-NEXT: [[TMP19:%.*]] = load i32, i32* [[CONV]], align 8
+// CHECK8-NEXT: [[CONV15:%.*]] = bitcast i64* [[N_CASTED]] to i32*
+// CHECK8-NEXT: store i32 [[TMP19]], i32* [[CONV15]], align 4
+// CHECK8-NEXT: [[TMP20:%.*]] = load i64, i64* [[N_CASTED]], align 8
+// CHECK8-NEXT: [[TMP21:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0
+// CHECK8-NEXT: [[TMP22:%.*]] = inttoptr i64 [[TMP17]] to i8*
+// CHECK8-NEXT: store i8* [[TMP22]], i8** [[TMP21]], align 8
+// CHECK8-NEXT: [[TMP23:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1
+// CHECK8-NEXT: [[TMP24:%.*]] = inttoptr i64 [[TMP18]] to i8*
+// CHECK8-NEXT: store i8* [[TMP24]], i8** [[TMP23]], align 8
+// CHECK8-NEXT: [[TMP25:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 2
+// CHECK8-NEXT: [[TMP26:%.*]] = inttoptr i64 [[TMP20]] to i8*
+// CHECK8-NEXT: store i8* [[TMP26]], i8** [[TMP25]], align 8
+// CHECK8-NEXT: [[TMP27:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 3
+// CHECK8-NEXT: [[TMP28:%.*]] = bitcast [10 x [10 x i32]]* [[TMP0]] to i8*
+// CHECK8-NEXT: store i8* [[TMP28]], i8** [[TMP27]], align 8
+// CHECK8-NEXT: [[TMP29:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK8-NEXT: [[TMP30:%.*]] = load i32, i32* [[TMP29]], align 4
+// CHECK8-NEXT: [[TMP31:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK8-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP30]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i64, i64, i64, [10 x [10 x i32]]*)* @__omp_outlined__9 to i8*), i8* null, i8** [[TMP31]], i64 4)
+// CHECK8-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK8: omp.inner.for.inc:
+// CHECK8-NEXT: [[TMP32:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8
+// CHECK8-NEXT: [[TMP33:%.*]] = load i64, i64* [[DOTOMP_STRIDE]], align 8
+// CHECK8-NEXT: [[ADD16:%.*]] = add nsw i64 [[TMP32]], [[TMP33]]
+// CHECK8-NEXT: store i64 [[ADD16]], i64* [[DOTOMP_IV]], align 8
+// CHECK8-NEXT: [[TMP34:%.*]] = load i64, i64* [[DOTOMP_COMB_LB]], align 8
+// CHECK8-NEXT: [[TMP35:%.*]] = load i64, i64* [[DOTOMP_STRIDE]], align 8
+// CHECK8-NEXT: [[ADD17:%.*]] = add nsw i64 [[TMP34]], [[TMP35]]
+// CHECK8-NEXT: store i64 [[ADD17]], i64* [[DOTOMP_COMB_LB]], align 8
+// CHECK8-NEXT: [[TMP36:%.*]] = load i64, i64* [[DOTOMP_COMB_UB]], align 8
+// CHECK8-NEXT: [[TMP37:%.*]] = load i64, i64* [[DOTOMP_STRIDE]], align 8
+// CHECK8-NEXT: [[ADD18:%.*]] = add nsw i64 [[TMP36]], [[TMP37]]
+// CHECK8-NEXT: store i64 [[ADD18]], i64* [[DOTOMP_COMB_UB]], align 8
+// CHECK8-NEXT: [[TMP38:%.*]] = load i64, i64* [[DOTOMP_COMB_UB]], align 8
+// CHECK8-NEXT: [[TMP39:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8
+// CHECK8-NEXT: [[CMP19:%.*]] = icmp sgt i64 [[TMP38]], [[TMP39]]
+// CHECK8-NEXT: br i1 [[CMP19]], label [[COND_TRUE20:%.*]], label [[COND_FALSE21:%.*]]
+// CHECK8: cond.true20:
+// CHECK8-NEXT: [[TMP40:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8
+// CHECK8-NEXT: br label [[COND_END22:%.*]]
+// CHECK8: cond.false21:
+// CHECK8-NEXT: [[TMP41:%.*]] = load i64, i64* [[DOTOMP_COMB_UB]], align 8
+// CHECK8-NEXT: br label [[COND_END22]]
+// CHECK8: cond.end22:
+// CHECK8-NEXT: [[COND23:%.*]] = phi i64 [ [[TMP40]], [[COND_TRUE20]] ], [ [[TMP41]], [[COND_FALSE21]] ]
+// CHECK8-NEXT: store i64 [[COND23]], i64* [[DOTOMP_COMB_UB]], align 8
+// CHECK8-NEXT: [[TMP42:%.*]] = load i64, i64* [[DOTOMP_COMB_LB]], align 8
+// CHECK8-NEXT: store i64 [[TMP42]], i64* [[DOTOMP_IV]], align 8
+// CHECK8-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK8: omp.inner.for.end:
+// CHECK8-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK8: omp.loop.exit:
+// CHECK8-NEXT: [[TMP43:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK8-NEXT: [[TMP44:%.*]] = load i32, i32* [[TMP43]], align 4
+// CHECK8-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP44]])
+// CHECK8-NEXT: br label [[OMP_PRECOND_END]]
+// CHECK8: omp.precond.end:
+// CHECK8-NEXT: ret void
+//
+//
+// CHECK8-LABEL: define {{[^@]+}}@__omp_outlined__9
+// CHECK8-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], i64 [[N:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]]) #[[ATTR0]] {
+// CHECK8-NEXT: entry:
+// CHECK8-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK8-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK8-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8
+// CHECK8-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8
+// CHECK8-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8
+// CHECK8-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 8
+// CHECK8-NEXT: [[DOTOMP_IV:%.*]] = alloca i64, align 8
+// CHECK8-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK8-NEXT: [[_TMP1:%.*]] = alloca i32, align 4
+// CHECK8-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK8-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4
+// CHECK8-NEXT: [[DOTCAPTURE_EXPR_3:%.*]] = alloca i64, align 8
+// CHECK8-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK8-NEXT: [[J:%.*]] = alloca i32, align 4
+// CHECK8-NEXT: [[DOTOMP_LB:%.*]] = alloca i64, align 8
+// CHECK8-NEXT: [[DOTOMP_UB:%.*]] = alloca i64, align 8
+// CHECK8-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i64, align 8
+// CHECK8-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK8-NEXT: [[I10:%.*]] = alloca i32, align 4
+// CHECK8-NEXT: [[J11:%.*]] = alloca i32, align 4
+// CHECK8-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK8-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
+// CHECK8-NEXT: store i64 [[DOTPREVIOUS_LB_]], i64* [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK8-NEXT: store i64 [[DOTPREVIOUS_UB_]], i64* [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK8-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8
+// CHECK8-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 8
+// CHECK8-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32*
+// CHECK8-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 8
+// CHECK8-NEXT: [[TMP1:%.*]] = load i32, i32* [[CONV]], align 8
+// CHECK8-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK8-NEXT: [[TMP2:%.*]] = load i32, i32* [[CONV]], align 8
+// CHECK8-NEXT: store i32 [[TMP2]], i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK8-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK8-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP3]], 0
+// CHECK8-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK8-NEXT: [[CONV4:%.*]] = sext i32 [[DIV]] to i64
+// CHECK8-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK8-NEXT: [[SUB5:%.*]] = sub nsw i32 [[TMP4]], 0
+// CHECK8-NEXT: [[DIV6:%.*]] = sdiv i32 [[SUB5]], 1
+// CHECK8-NEXT: [[CONV7:%.*]] = sext i32 [[DIV6]] to i64
+// CHECK8-NEXT: [[MUL:%.*]] = mul nsw i64 [[CONV4]], [[CONV7]]
+// CHECK8-NEXT: [[SUB8:%.*]] = sub nsw i64 [[MUL]], 1
+// CHECK8-NEXT: store i64 [[SUB8]], i64* [[DOTCAPTURE_EXPR_3]], align 8
+// CHECK8-NEXT: store i32 0, i32* [[I]], align 4
+// CHECK8-NEXT: store i32 0, i32* [[J]], align 4
+// CHECK8-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK8-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP5]]
+// CHECK8-NEXT: br i1 [[CMP]], label [[LAND_LHS_TRUE:%.*]], label [[OMP_PRECOND_END:%.*]]
+// CHECK8: land.lhs.true:
+// CHECK8-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK8-NEXT: [[CMP9:%.*]] = icmp slt i32 0, [[TMP6]]
+// CHECK8-NEXT: br i1 [[CMP9]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END]]
+// CHECK8: omp.precond.then:
+// CHECK8-NEXT: store i64 0, i64* [[DOTOMP_LB]], align 8
+// CHECK8-NEXT: [[TMP7:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8
+// CHECK8-NEXT: store i64 [[TMP7]], i64* [[DOTOMP_UB]], align 8
+// CHECK8-NEXT: [[TMP8:%.*]] = load i64, i64* [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK8-NEXT: [[TMP9:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK8-NEXT: store i64 [[TMP8]], i64* [[DOTOMP_LB]], align 8
+// CHECK8-NEXT: store i64 [[TMP9]], i64* [[DOTOMP_UB]], align 8
+// CHECK8-NEXT: store i64 1, i64* [[DOTOMP_STRIDE]], align 8
+// CHECK8-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK8-NEXT: [[TMP10:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK8-NEXT: [[TMP11:%.*]] = load i32, i32* [[TMP10]], align 4
+// CHECK8-NEXT: call void @__kmpc_for_static_init_8(%struct.ident_t* @[[GLOB2]], i32 [[TMP11]], i32 33, i32* [[DOTOMP_IS_LAST]], i64* [[DOTOMP_LB]], i64* [[DOTOMP_UB]], i64* [[DOTOMP_STRIDE]], i64 1, i64 1)
+// CHECK8-NEXT: [[TMP12:%.*]] = load i64, i64* [[DOTOMP_LB]], align 8
+// CHECK8-NEXT: store i64 [[TMP12]], i64* [[DOTOMP_IV]], align 8
+// CHECK8-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK8: omp.inner.for.cond:
+// CHECK8-NEXT: [[TMP13:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8
+// CHECK8-NEXT: [[TMP14:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK8-NEXT: [[CMP12:%.*]] = icmp ule i64 [[TMP13]], [[TMP14]]
+// CHECK8-NEXT: br i1 [[CMP12]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK8: omp.inner.for.body:
+// CHECK8-NEXT: [[TMP15:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8
+// CHECK8-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK8-NEXT: [[SUB13:%.*]] = sub nsw i32 [[TMP16]], 0
+// CHECK8-NEXT: [[DIV14:%.*]] = sdiv i32 [[SUB13]], 1
+// CHECK8-NEXT: [[MUL15:%.*]] = mul nsw i32 1, [[DIV14]]
+// CHECK8-NEXT: [[CONV16:%.*]] = sext i32 [[MUL15]] to i64
+// CHECK8-NEXT: [[DIV17:%.*]] = sdiv i64 [[TMP15]], [[CONV16]]
+// CHECK8-NEXT: [[MUL18:%.*]] = mul nsw i64 [[DIV17]], 1
+// CHECK8-NEXT: [[ADD:%.*]] = add nsw i64 0, [[MUL18]]
+// CHECK8-NEXT: [[CONV19:%.*]] = trunc i64 [[ADD]] to i32
+// CHECK8-NEXT: store i32 [[CONV19]], i32* [[I10]], align 4
+// CHECK8-NEXT: [[TMP17:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8
+// CHECK8-NEXT: [[TMP18:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8
+// CHECK8-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK8-NEXT: [[SUB20:%.*]] = sub nsw i32 [[TMP19]], 0
+// CHECK8-NEXT: [[DIV21:%.*]] = sdiv i32 [[SUB20]], 1
+// CHECK8-NEXT: [[MUL22:%.*]] = mul nsw i32 1, [[DIV21]]
+// CHECK8-NEXT: [[CONV23:%.*]] = sext i32 [[MUL22]] to i64
+// CHECK8-NEXT: [[DIV24:%.*]] = sdiv i64 [[TMP18]], [[CONV23]]
+// CHECK8-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK8-NEXT: [[SUB25:%.*]] = sub nsw i32 [[TMP20]], 0
+// CHECK8-NEXT: [[DIV26:%.*]] = sdiv i32 [[SUB25]], 1
+// CHECK8-NEXT: [[MUL27:%.*]] = mul nsw i32 1, [[DIV26]]
+// CHECK8-NEXT: [[CONV28:%.*]] = sext i32 [[MUL27]] to i64
+// CHECK8-NEXT: [[MUL29:%.*]] = mul nsw i64 [[DIV24]], [[CONV28]]
+// CHECK8-NEXT: [[SUB30:%.*]] = sub nsw i64 [[TMP17]], [[MUL29]]
+// CHECK8-NEXT: [[MUL31:%.*]] = mul nsw i64 [[SUB30]], 1
+// CHECK8-NEXT: [[ADD32:%.*]] = add nsw i64 0, [[MUL31]]
+// CHECK8-NEXT: [[CONV33:%.*]] = trunc i64 [[ADD32]] to i32
+// CHECK8-NEXT: store i32 [[CONV33]], i32* [[J11]], align 4
+// CHECK8-NEXT: [[TMP21:%.*]] = load i32, i32* [[I10]], align 4
+// CHECK8-NEXT: [[TMP22:%.*]] = load i32, i32* [[J11]], align 4
+// CHECK8-NEXT: [[ADD34:%.*]] = add nsw i32 [[TMP21]], [[TMP22]]
+// CHECK8-NEXT: [[TMP23:%.*]] = load i32, i32* [[I10]], align 4
+// CHECK8-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP23]] to i64
+// CHECK8-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* [[TMP0]], i64 0, i64 [[IDXPROM]]
+// CHECK8-NEXT: [[TMP24:%.*]] = load i32, i32* [[J11]], align 4
+// CHECK8-NEXT: [[IDXPROM35:%.*]] = sext i32 [[TMP24]] to i64
+// CHECK8-NEXT: [[ARRAYIDX36:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[ARRAYIDX]], i64 0, i64 [[IDXPROM35]]
+// CHECK8-NEXT: store i32 [[ADD34]], i32* [[ARRAYIDX36]], align 4
+// CHECK8-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK8: omp.body.continue:
+// CHECK8-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK8: omp.inner.for.inc:
+// CHECK8-NEXT: [[TMP25:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8
+// CHECK8-NEXT: [[TMP26:%.*]] = load i64, i64* [[DOTOMP_STRIDE]], align 8
+// CHECK8-NEXT: [[ADD37:%.*]] = add nsw i64 [[TMP25]], [[TMP26]]
+// CHECK8-NEXT: store i64 [[ADD37]], i64* [[DOTOMP_IV]], align 8
+// CHECK8-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK8: omp.inner.for.end:
+// CHECK8-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK8: omp.loop.exit:
+// CHECK8-NEXT: [[TMP27:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK8-NEXT: [[TMP28:%.*]] = load i32, i32* [[TMP27]], align 4
+// CHECK8-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP28]])
+// CHECK8-NEXT: br label [[OMP_PRECOND_END]]
+// CHECK8: omp.precond.end:
+// CHECK8-NEXT: ret void
+//
+//
+// CHECK8-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l74
+// CHECK8-SAME: (i64 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32* [[V:%.*]]) #[[ATTR0]] {
+// CHECK8-NEXT: entry:
+// CHECK8-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8
+// CHECK8-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 8
+// CHECK8-NEXT: [[V_ADDR:%.*]] = alloca i32*, align 8
+// CHECK8-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8
+// CHECK8-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK8-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK8-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK8-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8
+// CHECK8-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 8
+// CHECK8-NEXT: store i32* [[V]], i32** [[V_ADDR]], align 8
+// CHECK8-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32*
+// CHECK8-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 8
+// CHECK8-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK8-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0)
+// CHECK8-NEXT: br label [[DOTEXECUTE:%.*]]
+// CHECK8: .execute:
+// CHECK8-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]])
+// CHECK8-NEXT: [[TMP2:%.*]] = load i32, i32* [[CONV]], align 8
+// CHECK8-NEXT: [[CONV1:%.*]] = bitcast i64* [[N_CASTED]] to i32*
+// CHECK8-NEXT: store i32 [[TMP2]], i32* [[CONV1]], align 4
+// CHECK8-NEXT: [[TMP3:%.*]] = load i64, i64* [[N_CASTED]], align 8
+// CHECK8-NEXT: [[TMP4:%.*]] = load i32*, i32** [[V_ADDR]], align 8
+// CHECK8-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK8-NEXT: call void @__omp_outlined__10(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i64 [[TMP3]], [1000 x i32]* [[TMP0]], i32* [[TMP4]]) #[[ATTR2]]
+// CHECK8-NEXT: br label [[DOTOMP_DEINIT:%.*]]
+// CHECK8: .omp.deinit:
+// CHECK8-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0)
+// CHECK8-NEXT: br label [[DOTEXIT:%.*]]
+// CHECK8: .exit:
+// CHECK8-NEXT: ret void
+//
+//
+// CHECK8-LABEL: define {{[^@]+}}@__omp_outlined__10
+// CHECK8-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32* [[V:%.*]]) #[[ATTR0]] {
+// CHECK8-NEXT: entry:
+// CHECK8-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK8-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK8-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8
+// CHECK8-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 8
+// CHECK8-NEXT: [[V_ADDR:%.*]] = alloca i32*, align 8
+// CHECK8-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK8-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK8-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK8-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
+// CHECK8-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK8-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK8-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK8-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK8-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK8-NEXT: [[I3:%.*]] = alloca i32, align 4
+// CHECK8-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8
+// CHECK8-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [5 x i8*], align 8
+// CHECK8-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK8-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
+// CHECK8-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8
+// CHECK8-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 8
+// CHECK8-NEXT: store i32* [[V]], i32** [[V_ADDR]], align 8
+// CHECK8-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32*
+// CHECK8-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 8
+// CHECK8-NEXT: [[TMP1:%.*]] = load i32, i32* [[CONV]], align 8
+// CHECK8-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK8-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK8-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0
+// CHECK8-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK8-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1
+// CHECK8-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK8-NEXT: store i32 0, i32* [[I]], align 4
+// CHECK8-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK8-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]]
+// CHECK8-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]]
+// CHECK8: omp.precond.then:
+// CHECK8-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK8-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK8-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK8-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK8-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK8-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK8-NEXT: [[TMP5:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK8-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4
+// CHECK8-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP6]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]])
+// CHECK8-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK8-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK8-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP7]], [[TMP8]]
+// CHECK8-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK8: cond.true:
+// CHECK8-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK8-NEXT: br label [[COND_END:%.*]]
+// CHECK8: cond.false:
+// CHECK8-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK8-NEXT: br label [[COND_END]]
+// CHECK8: cond.end:
+// CHECK8-NEXT: [[COND:%.*]] = phi i32 [ [[TMP9]], [[COND_TRUE]] ], [ [[TMP10]], [[COND_FALSE]] ]
+// CHECK8-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK8-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK8-NEXT: store i32 [[TMP11]], i32* [[DOTOMP_IV]], align 4
+// CHECK8-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK8: omp.inner.for.cond:
+// CHECK8-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK8-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK8-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP13]], 1
+// CHECK8-NEXT: [[CMP5:%.*]] = icmp slt i32 [[TMP12]], [[ADD]]
+// CHECK8-NEXT: br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK8: omp.inner.for.body:
+// CHECK8-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK8-NEXT: [[TMP15:%.*]] = zext i32 [[TMP14]] to i64
+// CHECK8-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK8-NEXT: [[TMP17:%.*]] = zext i32 [[TMP16]] to i64
+// CHECK8-NEXT: [[TMP18:%.*]] = load i32, i32* [[CONV]], align 8
+// CHECK8-NEXT: [[CONV6:%.*]] = bitcast i64* [[N_CASTED]] to i32*
+// CHECK8-NEXT: store i32 [[TMP18]], i32* [[CONV6]], align 4
+// CHECK8-NEXT: [[TMP19:%.*]] = load i64, i64* [[N_CASTED]], align 8
+// CHECK8-NEXT: [[TMP20:%.*]] = load i32*, i32** [[V_ADDR]], align 8
+// CHECK8-NEXT: [[TMP21:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0
+//
CHECK8-NEXT: [[TMP22:%.*]] = inttoptr i64 [[TMP15]] to i8* +// CHECK8-NEXT: store i8* [[TMP22]], i8** [[TMP21]], align 8 +// CHECK8-NEXT: [[TMP23:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1 +// CHECK8-NEXT: [[TMP24:%.*]] = inttoptr i64 [[TMP17]] to i8* +// CHECK8-NEXT: store i8* [[TMP24]], i8** [[TMP23]], align 8 +// CHECK8-NEXT: [[TMP25:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 2 +// CHECK8-NEXT: [[TMP26:%.*]] = inttoptr i64 [[TMP19]] to i8* +// CHECK8-NEXT: store i8* [[TMP26]], i8** [[TMP25]], align 8 +// CHECK8-NEXT: [[TMP27:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 3 +// CHECK8-NEXT: [[TMP28:%.*]] = bitcast [1000 x i32]* [[TMP0]] to i8* +// CHECK8-NEXT: store i8* [[TMP28]], i8** [[TMP27]], align 8 +// CHECK8-NEXT: [[TMP29:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 4 +// CHECK8-NEXT: [[TMP30:%.*]] = bitcast i32* [[TMP20]] to i8* +// CHECK8-NEXT: store i8* [[TMP30]], i8** [[TMP29]], align 8 +// CHECK8-NEXT: [[TMP31:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK8-NEXT: [[TMP32:%.*]] = load i32, i32* [[TMP31]], align 4 +// CHECK8-NEXT: [[TMP33:%.*]] = bitcast [5 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK8-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP32]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i64, i64, i64, [1000 x i32]*, i32*)* @__omp_outlined__11 to i8*), i8* null, i8** [[TMP33]], i64 5) +// CHECK8-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK8: omp.inner.for.inc: +// CHECK8-NEXT: [[TMP34:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK8-NEXT: [[TMP35:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK8-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP34]], [[TMP35]] +// CHECK8-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_IV]], align 4 +// CHECK8-NEXT: [[TMP36:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK8-NEXT: [[TMP37:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK8-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP36]], [[TMP37]] +// CHECK8-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK8-NEXT: [[TMP38:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK8-NEXT: [[TMP39:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK8-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP38]], [[TMP39]] +// CHECK8-NEXT: store i32 [[ADD9]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK8-NEXT: [[TMP40:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK8-NEXT: [[TMP41:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK8-NEXT: [[CMP10:%.*]] = icmp sgt i32 [[TMP40]], [[TMP41]] +// CHECK8-NEXT: br i1 [[CMP10]], label [[COND_TRUE11:%.*]], label [[COND_FALSE12:%.*]] +// CHECK8: cond.true11: +// CHECK8-NEXT: [[TMP42:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK8-NEXT: br label [[COND_END13:%.*]] +// CHECK8: cond.false12: +// CHECK8-NEXT: [[TMP43:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK8-NEXT: br label [[COND_END13]] +// CHECK8: cond.end13: +// CHECK8-NEXT: [[COND14:%.*]] = phi i32 [ [[TMP42]], [[COND_TRUE11]] ], [ [[TMP43]], [[COND_FALSE12]] ] +// CHECK8-NEXT: store i32 [[COND14]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK8-NEXT: [[TMP44:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK8-NEXT: store i32 [[TMP44]], i32* [[DOTOMP_IV]], align 4 +// CHECK8-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK8: omp.inner.for.end: +// CHECK8-NEXT: br label 
[[OMP_LOOP_EXIT:%.*]] +// CHECK8: omp.loop.exit: +// CHECK8-NEXT: [[TMP45:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK8-NEXT: [[TMP46:%.*]] = load i32, i32* [[TMP45]], align 4 +// CHECK8-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP46]]) +// CHECK8-NEXT: br label [[OMP_PRECOND_END]] +// CHECK8: omp.precond.end: +// CHECK8-NEXT: ret void +// +// +// CHECK8-LABEL: define {{[^@]+}}@__omp_outlined__11 +// CHECK8-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], i64 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32* [[V:%.*]]) #[[ATTR0]] { +// CHECK8-NEXT: entry: +// CHECK8-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK8-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK8-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8 +// CHECK8-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8 +// CHECK8-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 +// CHECK8-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 8 +// CHECK8-NEXT: [[V_ADDR:%.*]] = alloca i32*, align 8 +// CHECK8-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: [[I5:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK8-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +// CHECK8-NEXT: store i64 [[DOTPREVIOUS_LB_]], i64* [[DOTPREVIOUS_LB__ADDR]], align 8 +// CHECK8-NEXT: store i64 [[DOTPREVIOUS_UB_]], i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK8-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 +// CHECK8-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 8 +// CHECK8-NEXT: store i32* [[V]], i32** [[V_ADDR]], align 8 +// CHECK8-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* +// CHECK8-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 8 +// CHECK8-NEXT: [[TMP1:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK8-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK8-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK8-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 +// CHECK8-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK8-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK8-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK8-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK8-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK8-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] +// CHECK8-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK8: omp.precond.then: +// CHECK8-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 +// CHECK8-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK8-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_UB]], align 4 +// CHECK8-NEXT: [[TMP5:%.*]] = load i64, i64* [[DOTPREVIOUS_LB__ADDR]], align 8 +// CHECK8-NEXT: [[CONV3:%.*]] = trunc i64 [[TMP5]] to i32 +// 
CHECK8-NEXT: [[TMP6:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK8-NEXT: [[CONV4:%.*]] = trunc i64 [[TMP6]] to i32 +// CHECK8-NEXT: store i32 [[CONV3]], i32* [[DOTOMP_LB]], align 4 +// CHECK8-NEXT: store i32 [[CONV4]], i32* [[DOTOMP_UB]], align 4 +// CHECK8-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK8-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK8-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK8-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4 +// CHECK8-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP8]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) +// CHECK8-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK8-NEXT: store i32 [[TMP9]], i32* [[DOTOMP_IV]], align 4 +// CHECK8-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK8: omp.inner.for.cond: +// CHECK8-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK8-NEXT: [[CONV6:%.*]] = sext i32 [[TMP10]] to i64 +// CHECK8-NEXT: [[TMP11:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK8-NEXT: [[CMP7:%.*]] = icmp ule i64 [[CONV6]], [[TMP11]] +// CHECK8-NEXT: br i1 [[CMP7]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK8: omp.inner.for.body: +// CHECK8-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK8-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP12]], 1 +// CHECK8-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK8-NEXT: store i32 [[ADD]], i32* [[I5]], align 4 +// CHECK8-NEXT: [[TMP13:%.*]] = load i32*, i32** [[V_ADDR]], align 8 +// CHECK8-NEXT: [[TMP14:%.*]] = load i32, i32* [[I5]], align 4 +// CHECK8-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP14]] to i64 +// CHECK8-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TMP13]], i64 [[IDXPROM]] +// CHECK8-NEXT: [[TMP15:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +// CHECK8-NEXT: [[TMP16:%.*]] = load i32, i32* [[I5]], align 4 +// CHECK8-NEXT: [[IDXPROM8:%.*]] = sext i32 [[TMP16]] to i64 +// CHECK8-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds [1000 x i32], [1000 x i32]* [[TMP0]], i64 0, i64 [[IDXPROM8]] +// CHECK8-NEXT: store i32 [[TMP15]], i32* [[ARRAYIDX9]], align 4 +// CHECK8-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK8: omp.body.continue: +// CHECK8-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK8: omp.inner.for.inc: +// CHECK8-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK8-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK8-NEXT: [[ADD10:%.*]] = add nsw i32 [[TMP17]], [[TMP18]] +// CHECK8-NEXT: store i32 [[ADD10]], i32* [[DOTOMP_IV]], align 4 +// CHECK8-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK8: omp.inner.for.end: +// CHECK8-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK8: omp.loop.exit: +// CHECK8-NEXT: [[TMP19:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK8-NEXT: [[TMP20:%.*]] = load i32, i32* [[TMP19]], align 4 +// CHECK8-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP20]]) +// CHECK8-NEXT: br label [[OMP_PRECOND_END]] +// CHECK8: omp.precond.end: +// CHECK8-NEXT: ret void +// +// +// CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l43 +// CHECK9-SAME: (i64 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i64 [[L:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK9-NEXT: entry: +// CHECK9-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 +// 
CHECK9-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 8 +// CHECK9-NEXT: [[L_ADDR:%.*]] = alloca i64, align 8 +// CHECK9-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8 +// CHECK9-NEXT: [[L_CASTED:%.*]] = alloca i64, align 8 +// CHECK9-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK9-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 +// CHECK9-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 8 +// CHECK9-NEXT: store i64 [[L]], i64* [[L_ADDR]], align 8 +// CHECK9-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* +// CHECK9-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 8 +// CHECK9-NEXT: [[CONV1:%.*]] = bitcast i64* [[L_ADDR]] to i32* +// CHECK9-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK9-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) +// CHECK9-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK9: .execute: +// CHECK9-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3:[0-9]+]]) +// CHECK9-NEXT: [[TMP2:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK9-NEXT: [[CONV2:%.*]] = bitcast i64* [[N_CASTED]] to i32* +// CHECK9-NEXT: store i32 [[TMP2]], i32* [[CONV2]], align 4 +// CHECK9-NEXT: [[TMP3:%.*]] = load i64, i64* [[N_CASTED]], align 8 +// CHECK9-NEXT: [[TMP4:%.*]] = load i32, i32* [[CONV1]], align 8 +// CHECK9-NEXT: [[CONV3:%.*]] = bitcast i64* [[L_CASTED]] to i32* +// CHECK9-NEXT: store i32 [[TMP4]], i32* [[CONV3]], align 4 +// CHECK9-NEXT: [[TMP5:%.*]] = load i64, i64* [[L_CASTED]], align 8 +// CHECK9-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK9-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i64 [[TMP3]], [1000 x i32]* [[TMP0]], i64 [[TMP5]]) #[[ATTR2:[0-9]+]] +// CHECK9-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK9: .omp.deinit: +// CHECK9-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) +// CHECK9-NEXT: br label [[DOTEXIT:%.*]] +// CHECK9: .exit: +// CHECK9-NEXT: ret void +// +// +// CHECK9-LABEL: define {{[^@]+}}@__omp_outlined__ +// CHECK9-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i64 [[L:%.*]]) #[[ATTR0]] { +// CHECK9-NEXT: entry: +// CHECK9-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK9-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK9-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 +// CHECK9-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 8 +// CHECK9-NEXT: [[L_ADDR:%.*]] = alloca i64, align 8 +// CHECK9-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[DOTCAPTURE_EXPR_3:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[I5:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8 +// CHECK9-NEXT: [[L_CASTED:%.*]] = alloca i64, align 8 +// CHECK9-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [5 x i8*], align 8 +// CHECK9-NEXT: store i32* 
[[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK9-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +// CHECK9-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 +// CHECK9-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 8 +// CHECK9-NEXT: store i64 [[L]], i64* [[L_ADDR]], align 8 +// CHECK9-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* +// CHECK9-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 8 +// CHECK9-NEXT: [[CONV1:%.*]] = bitcast i64* [[L_ADDR]] to i32* +// CHECK9-NEXT: [[TMP1:%.*]] = call i8* @__kmpc_data_sharing_push_stack(i64 4, i16 1) +// CHECK9-NEXT: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to %struct._globalized_locals_ty* +// CHECK9-NEXT: [[L2:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP2]], i32 0, i32 0 +// CHECK9-NEXT: [[TMP3:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK9-NEXT: store i32 [[TMP3]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK9-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK9-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP4]], 0 +// CHECK9-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK9-NEXT: [[SUB4:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK9-NEXT: store i32 [[SUB4]], i32* [[DOTCAPTURE_EXPR_3]], align 4 +// CHECK9-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK9-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK9-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP5]] +// CHECK9-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK9: omp.precond.then: +// CHECK9-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK9-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4 +// CHECK9-NEXT: store i32 [[TMP6]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK9-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK9-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK9-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK9-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4 +// CHECK9-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1:[0-9]+]], i32 [[TMP8]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 128) +// CHECK9-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK9-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4 +// CHECK9-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[TMP9]], [[TMP10]] +// CHECK9-NEXT: br i1 [[CMP6]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK9: cond.true: +// CHECK9-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4 +// CHECK9-NEXT: br label [[COND_END:%.*]] +// CHECK9: cond.false: +// CHECK9-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK9-NEXT: br label [[COND_END]] +// CHECK9: cond.end: +// CHECK9-NEXT: [[COND:%.*]] = phi i32 [ [[TMP11]], [[COND_TRUE]] ], [ [[TMP12]], [[COND_FALSE]] ] +// CHECK9-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK9-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK9-NEXT: store i32 [[TMP13]], i32* [[DOTOMP_IV]], align 4 +// CHECK9-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK9: omp.inner.for.cond: +// CHECK9-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK9-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4 +// CHECK9-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP15]], 1 +// 
CHECK9-NEXT: [[CMP7:%.*]] = icmp slt i32 [[TMP14]], [[ADD]] +// CHECK9-NEXT: br i1 [[CMP7]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK9: omp.inner.for.body: +// CHECK9-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK9-NEXT: [[TMP17:%.*]] = zext i32 [[TMP16]] to i64 +// CHECK9-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK9-NEXT: [[TMP19:%.*]] = zext i32 [[TMP18]] to i64 +// CHECK9-NEXT: [[TMP20:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK9-NEXT: [[CONV8:%.*]] = bitcast i64* [[N_CASTED]] to i32* +// CHECK9-NEXT: store i32 [[TMP20]], i32* [[CONV8]], align 4 +// CHECK9-NEXT: [[TMP21:%.*]] = load i64, i64* [[N_CASTED]], align 8 +// CHECK9-NEXT: [[TMP22:%.*]] = load i32, i32* [[CONV1]], align 8 +// CHECK9-NEXT: [[CONV9:%.*]] = bitcast i64* [[L_CASTED]] to i32* +// CHECK9-NEXT: store i32 [[TMP22]], i32* [[CONV9]], align 4 +// CHECK9-NEXT: [[TMP23:%.*]] = load i64, i64* [[L_CASTED]], align 8 +// CHECK9-NEXT: [[TMP24:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0 +// CHECK9-NEXT: [[TMP25:%.*]] = inttoptr i64 [[TMP17]] to i8* +// CHECK9-NEXT: store i8* [[TMP25]], i8** [[TMP24]], align 8 +// CHECK9-NEXT: [[TMP26:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1 +// CHECK9-NEXT: [[TMP27:%.*]] = inttoptr i64 [[TMP19]] to i8* +// CHECK9-NEXT: store i8* [[TMP27]], i8** [[TMP26]], align 8 +// CHECK9-NEXT: [[TMP28:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 2 +// CHECK9-NEXT: [[TMP29:%.*]] = inttoptr i64 [[TMP21]] to i8* +// CHECK9-NEXT: store i8* [[TMP29]], i8** [[TMP28]], align 8 +// CHECK9-NEXT: [[TMP30:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 3 +// CHECK9-NEXT: [[TMP31:%.*]] = bitcast [1000 x i32]* [[TMP0]] to i8* +// CHECK9-NEXT: store i8* [[TMP31]], i8** [[TMP30]], align 8 +// CHECK9-NEXT: [[TMP32:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 4 +// CHECK9-NEXT: [[TMP33:%.*]] = inttoptr i64 [[TMP23]] to i8* +// CHECK9-NEXT: store i8* [[TMP33]], i8** [[TMP32]], align 8 +// CHECK9-NEXT: [[TMP34:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK9-NEXT: [[TMP35:%.*]] = load i32, i32* [[TMP34]], align 4 +// CHECK9-NEXT: [[TMP36:%.*]] = bitcast [5 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK9-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP35]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i64, i64, i64, [1000 x i32]*, i64)* @__omp_outlined__1 to i8*), i8* null, i8** [[TMP36]], i64 5) +// CHECK9-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK9: omp.inner.for.inc: +// CHECK9-NEXT: [[TMP37:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK9-NEXT: [[TMP38:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK9-NEXT: [[ADD10:%.*]] = add nsw i32 [[TMP37]], [[TMP38]] +// CHECK9-NEXT: store i32 [[ADD10]], i32* [[DOTOMP_IV]], align 4 +// CHECK9-NEXT: [[TMP39:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK9-NEXT: [[TMP40:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK9-NEXT: [[ADD11:%.*]] = add nsw i32 [[TMP39]], [[TMP40]] +// CHECK9-NEXT: store i32 [[ADD11]], i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK9-NEXT: [[TMP41:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK9-NEXT: [[TMP42:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK9-NEXT: [[ADD12:%.*]] = add nsw i32 [[TMP41]], [[TMP42]] +// 
CHECK9-NEXT: store i32 [[ADD12]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK9-NEXT: [[TMP43:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK9-NEXT: [[TMP44:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4 +// CHECK9-NEXT: [[CMP13:%.*]] = icmp sgt i32 [[TMP43]], [[TMP44]] +// CHECK9-NEXT: br i1 [[CMP13]], label [[COND_TRUE14:%.*]], label [[COND_FALSE15:%.*]] +// CHECK9: cond.true14: +// CHECK9-NEXT: [[TMP45:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4 +// CHECK9-NEXT: br label [[COND_END16:%.*]] +// CHECK9: cond.false15: +// CHECK9-NEXT: [[TMP46:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK9-NEXT: br label [[COND_END16]] +// CHECK9: cond.end16: +// CHECK9-NEXT: [[COND17:%.*]] = phi i32 [ [[TMP45]], [[COND_TRUE14]] ], [ [[TMP46]], [[COND_FALSE15]] ] +// CHECK9-NEXT: store i32 [[COND17]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK9-NEXT: [[TMP47:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK9-NEXT: store i32 [[TMP47]], i32* [[DOTOMP_IV]], align 4 +// CHECK9-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK9: omp.inner.for.end: +// CHECK9-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK9: omp.loop.exit: +// CHECK9-NEXT: [[TMP48:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK9-NEXT: [[TMP49:%.*]] = load i32, i32* [[TMP48]], align 4 +// CHECK9-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP49]]) +// CHECK9-NEXT: [[TMP50:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK9-NEXT: [[TMP51:%.*]] = icmp ne i32 [[TMP50]], 0 +// CHECK9-NEXT: br i1 [[TMP51]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]] +// CHECK9: .omp.lastprivate.then: +// CHECK9-NEXT: [[TMP52:%.*]] = load i32, i32* [[CONV1]], align 8 +// CHECK9-NEXT: store i32 [[TMP52]], i32* [[CONV1]], align 8 +// CHECK9-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]] +// CHECK9: .omp.lastprivate.done: +// CHECK9-NEXT: br label [[OMP_PRECOND_END]] +// CHECK9: omp.precond.end: +// CHECK9-NEXT: call void @__kmpc_data_sharing_pop_stack(i8* [[TMP1]]) +// CHECK9-NEXT: ret void +// +// +// CHECK9-LABEL: define {{[^@]+}}@__omp_outlined__1 +// CHECK9-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], i64 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i64 [[L:%.*]]) #[[ATTR0]] { +// CHECK9-NEXT: entry: +// CHECK9-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK9-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK9-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8 +// CHECK9-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8 +// CHECK9-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 +// CHECK9-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 8 +// CHECK9-NEXT: [[L_ADDR:%.*]] = alloca i64, align 8 +// CHECK9-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[I6:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK9-NEXT: store i32* 
[[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +// CHECK9-NEXT: store i64 [[DOTPREVIOUS_LB_]], i64* [[DOTPREVIOUS_LB__ADDR]], align 8 +// CHECK9-NEXT: store i64 [[DOTPREVIOUS_UB_]], i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK9-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 +// CHECK9-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 8 +// CHECK9-NEXT: store i64 [[L]], i64* [[L_ADDR]], align 8 +// CHECK9-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* +// CHECK9-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 8 +// CHECK9-NEXT: [[CONV1:%.*]] = bitcast i64* [[L_ADDR]] to i32* +// CHECK9-NEXT: [[TMP1:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK9-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK9-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK9-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 +// CHECK9-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK9-NEXT: [[SUB3:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK9-NEXT: store i32 [[SUB3]], i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK9-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK9-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK9-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] +// CHECK9-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK9: omp.precond.then: +// CHECK9-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 +// CHECK9-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK9-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_UB]], align 4 +// CHECK9-NEXT: [[TMP5:%.*]] = load i64, i64* [[DOTPREVIOUS_LB__ADDR]], align 8 +// CHECK9-NEXT: [[CONV4:%.*]] = trunc i64 [[TMP5]] to i32 +// CHECK9-NEXT: [[TMP6:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK9-NEXT: [[CONV5:%.*]] = trunc i64 [[TMP6]] to i32 +// CHECK9-NEXT: store i32 [[CONV4]], i32* [[DOTOMP_LB]], align 4 +// CHECK9-NEXT: store i32 [[CONV5]], i32* [[DOTOMP_UB]], align 4 +// CHECK9-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK9-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK9-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK9-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4 +// CHECK9-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP8]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 32) +// CHECK9-NEXT: br label [[OMP_DISPATCH_COND:%.*]] +// CHECK9: omp.dispatch.cond: +// CHECK9-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK9-NEXT: [[CONV7:%.*]] = sext i32 [[TMP9]] to i64 +// CHECK9-NEXT: [[TMP10:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK9-NEXT: [[CMP8:%.*]] = icmp ugt i64 [[CONV7]], [[TMP10]] +// CHECK9-NEXT: br i1 [[CMP8]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK9: cond.true: +// CHECK9-NEXT: [[TMP11:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK9-NEXT: br label [[COND_END:%.*]] +// CHECK9: cond.false: +// CHECK9-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK9-NEXT: [[CONV9:%.*]] = sext i32 [[TMP12]] to i64 +// CHECK9-NEXT: br label [[COND_END]] +// CHECK9: cond.end: +// CHECK9-NEXT: [[COND:%.*]] = phi i64 [ [[TMP11]], [[COND_TRUE]] ], [ [[CONV9]], [[COND_FALSE]] ] +// CHECK9-NEXT: [[CONV10:%.*]] = trunc i64 [[COND]] to i32 +// CHECK9-NEXT: store i32 [[CONV10]], i32* [[DOTOMP_UB]], align 4 +// CHECK9-NEXT: 
[[TMP13:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK9-NEXT: store i32 [[TMP13]], i32* [[DOTOMP_IV]], align 4 +// CHECK9-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK9-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK9-NEXT: [[CMP11:%.*]] = icmp sle i32 [[TMP14]], [[TMP15]] +// CHECK9-NEXT: br i1 [[CMP11]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]] +// CHECK9: omp.dispatch.body: +// CHECK9-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK9: omp.inner.for.cond: +// CHECK9-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK9-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK9-NEXT: [[CMP12:%.*]] = icmp sle i32 [[TMP16]], [[TMP17]] +// CHECK9-NEXT: br i1 [[CMP12]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK9: omp.inner.for.body: +// CHECK9-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK9-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP18]], 1 +// CHECK9-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK9-NEXT: store i32 [[ADD]], i32* [[I6]], align 4 +// CHECK9-NEXT: [[TMP19:%.*]] = load i32, i32* [[I6]], align 4 +// CHECK9-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP19]] to i64 +// CHECK9-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1000 x i32], [1000 x i32]* [[TMP0]], i64 0, i64 [[IDXPROM]] +// CHECK9-NEXT: store i32 1, i32* [[ARRAYIDX]], align 4 +// CHECK9-NEXT: [[TMP20:%.*]] = load i32, i32* [[I6]], align 4 +// CHECK9-NEXT: store i32 [[TMP20]], i32* [[CONV1]], align 8 +// CHECK9-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK9: omp.body.continue: +// CHECK9-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK9: omp.inner.for.inc: +// CHECK9-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK9-NEXT: [[ADD13:%.*]] = add nsw i32 [[TMP21]], 1 +// CHECK9-NEXT: store i32 [[ADD13]], i32* [[DOTOMP_IV]], align 4 +// CHECK9-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK9: omp.inner.for.end: +// CHECK9-NEXT: br label [[OMP_DISPATCH_INC:%.*]] +// CHECK9: omp.dispatch.inc: +// CHECK9-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK9-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK9-NEXT: [[ADD14:%.*]] = add nsw i32 [[TMP22]], [[TMP23]] +// CHECK9-NEXT: store i32 [[ADD14]], i32* [[DOTOMP_LB]], align 4 +// CHECK9-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK9-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK9-NEXT: [[ADD15:%.*]] = add nsw i32 [[TMP24]], [[TMP25]] +// CHECK9-NEXT: store i32 [[ADD15]], i32* [[DOTOMP_UB]], align 4 +// CHECK9-NEXT: br label [[OMP_DISPATCH_COND]] +// CHECK9: omp.dispatch.end: +// CHECK9-NEXT: [[TMP26:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK9-NEXT: [[TMP27:%.*]] = load i32, i32* [[TMP26]], align 4 +// CHECK9-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP27]]) +// CHECK9-NEXT: [[TMP28:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK9-NEXT: [[TMP29:%.*]] = icmp ne i32 [[TMP28]], 0 +// CHECK9-NEXT: br i1 [[TMP29]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]] +// CHECK9: .omp.lastprivate.then: +// CHECK9-NEXT: [[TMP30:%.*]] = load i32, i32* [[CONV1]], align 8 +// CHECK9-NEXT: store i32 [[TMP30]], i32* [[CONV1]], align 8 +// CHECK9-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]] +// CHECK9: .omp.lastprivate.done: +// CHECK9-NEXT: br label [[OMP_PRECOND_END]] +// CHECK9: omp.precond.end: +// CHECK9-NEXT: ret void +// 
+// +// CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l49 +// CHECK9-SAME: (i64 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { +// CHECK9-NEXT: entry: +// CHECK9-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 +// CHECK9-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 8 +// CHECK9-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8 +// CHECK9-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK9-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 +// CHECK9-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 8 +// CHECK9-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* +// CHECK9-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 8 +// CHECK9-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK9-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) +// CHECK9-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK9: .execute: +// CHECK9-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) +// CHECK9-NEXT: [[TMP2:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK9-NEXT: [[CONV1:%.*]] = bitcast i64* [[N_CASTED]] to i32* +// CHECK9-NEXT: store i32 [[TMP2]], i32* [[CONV1]], align 4 +// CHECK9-NEXT: [[TMP3:%.*]] = load i64, i64* [[N_CASTED]], align 8 +// CHECK9-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK9-NEXT: call void @__omp_outlined__2(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i64 [[TMP3]], [1000 x i16]* [[TMP0]]) #[[ATTR2]] +// CHECK9-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK9: .omp.deinit: +// CHECK9-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) +// CHECK9-NEXT: br label [[DOTEXIT:%.*]] +// CHECK9: .exit: +// CHECK9-NEXT: ret void +// +// +// CHECK9-LABEL: define {{[^@]+}}@__omp_outlined__2 +// CHECK9-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { +// CHECK9-NEXT: entry: +// CHECK9-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK9-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK9-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 +// CHECK9-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 8 +// CHECK9-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[I3:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8 +// CHECK9-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x i8*], align 8 +// CHECK9-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK9-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +// CHECK9-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 +// CHECK9-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 8 +// CHECK9-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* +// 
CHECK9-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 8 +// CHECK9-NEXT: [[TMP1:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK9-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK9-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK9-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 +// CHECK9-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK9-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK9-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK9-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK9-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK9-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] +// CHECK9-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK9: omp.precond.then: +// CHECK9-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK9-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK9-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK9-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK9-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK9-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK9-NEXT: [[TMP5:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK9-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4 +// CHECK9-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP6]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) +// CHECK9-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK9-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK9-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP7]], [[TMP8]] +// CHECK9-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK9: cond.true: +// CHECK9-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK9-NEXT: br label [[COND_END:%.*]] +// CHECK9: cond.false: +// CHECK9-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK9-NEXT: br label [[COND_END]] +// CHECK9: cond.end: +// CHECK9-NEXT: [[COND:%.*]] = phi i32 [ [[TMP9]], [[COND_TRUE]] ], [ [[TMP10]], [[COND_FALSE]] ] +// CHECK9-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK9-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK9-NEXT: store i32 [[TMP11]], i32* [[DOTOMP_IV]], align 4 +// CHECK9-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK9: omp.inner.for.cond: +// CHECK9-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK9-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK9-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP13]], 1 +// CHECK9-NEXT: [[CMP5:%.*]] = icmp slt i32 [[TMP12]], [[ADD]] +// CHECK9-NEXT: br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK9: omp.inner.for.body: +// CHECK9-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK9-NEXT: [[TMP15:%.*]] = zext i32 [[TMP14]] to i64 +// CHECK9-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK9-NEXT: [[TMP17:%.*]] = zext i32 [[TMP16]] to i64 +// CHECK9-NEXT: [[TMP18:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK9-NEXT: [[CONV6:%.*]] = bitcast i64* [[N_CASTED]] to i32* +// CHECK9-NEXT: store i32 [[TMP18]], i32* [[CONV6]], align 4 +// CHECK9-NEXT: [[TMP19:%.*]] = load 
i64, i64* [[N_CASTED]], align 8 +// CHECK9-NEXT: [[TMP20:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0 +// CHECK9-NEXT: [[TMP21:%.*]] = inttoptr i64 [[TMP15]] to i8* +// CHECK9-NEXT: store i8* [[TMP21]], i8** [[TMP20]], align 8 +// CHECK9-NEXT: [[TMP22:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1 +// CHECK9-NEXT: [[TMP23:%.*]] = inttoptr i64 [[TMP17]] to i8* +// CHECK9-NEXT: store i8* [[TMP23]], i8** [[TMP22]], align 8 +// CHECK9-NEXT: [[TMP24:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 2 +// CHECK9-NEXT: [[TMP25:%.*]] = inttoptr i64 [[TMP19]] to i8* +// CHECK9-NEXT: store i8* [[TMP25]], i8** [[TMP24]], align 8 +// CHECK9-NEXT: [[TMP26:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 3 +// CHECK9-NEXT: [[TMP27:%.*]] = bitcast [1000 x i16]* [[TMP0]] to i8* +// CHECK9-NEXT: store i8* [[TMP27]], i8** [[TMP26]], align 8 +// CHECK9-NEXT: [[TMP28:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK9-NEXT: [[TMP29:%.*]] = load i32, i32* [[TMP28]], align 4 +// CHECK9-NEXT: [[TMP30:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK9-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP29]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i64, i64, i64, [1000 x i16]*)* @__omp_outlined__3 to i8*), i8* null, i8** [[TMP30]], i64 4) +// CHECK9-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK9: omp.inner.for.inc: +// CHECK9-NEXT: [[TMP31:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK9-NEXT: [[TMP32:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK9-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP31]], [[TMP32]] +// CHECK9-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_IV]], align 4 +// CHECK9-NEXT: [[TMP33:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK9-NEXT: [[TMP34:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK9-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP33]], [[TMP34]] +// CHECK9-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK9-NEXT: [[TMP35:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK9-NEXT: [[TMP36:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK9-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP35]], [[TMP36]] +// CHECK9-NEXT: store i32 [[ADD9]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK9-NEXT: [[TMP37:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK9-NEXT: [[TMP38:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK9-NEXT: [[CMP10:%.*]] = icmp sgt i32 [[TMP37]], [[TMP38]] +// CHECK9-NEXT: br i1 [[CMP10]], label [[COND_TRUE11:%.*]], label [[COND_FALSE12:%.*]] +// CHECK9: cond.true11: +// CHECK9-NEXT: [[TMP39:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK9-NEXT: br label [[COND_END13:%.*]] +// CHECK9: cond.false12: +// CHECK9-NEXT: [[TMP40:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK9-NEXT: br label [[COND_END13]] +// CHECK9: cond.end13: +// CHECK9-NEXT: [[COND14:%.*]] = phi i32 [ [[TMP39]], [[COND_TRUE11]] ], [ [[TMP40]], [[COND_FALSE12]] ] +// CHECK9-NEXT: store i32 [[COND14]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK9-NEXT: [[TMP41:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK9-NEXT: store i32 [[TMP41]], i32* [[DOTOMP_IV]], align 4 +// CHECK9-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK9: omp.inner.for.end: +// CHECK9-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK9: omp.loop.exit: +// CHECK9-NEXT: [[TMP42:%.*]] = load i32*, 
i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK9-NEXT: [[TMP43:%.*]] = load i32, i32* [[TMP42]], align 4 +// CHECK9-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP43]]) +// CHECK9-NEXT: br label [[OMP_PRECOND_END]] +// CHECK9: omp.precond.end: +// CHECK9-NEXT: ret void +// +// +// CHECK9-LABEL: define {{[^@]+}}@__omp_outlined__3 +// CHECK9-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], i64 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { +// CHECK9-NEXT: entry: +// CHECK9-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK9-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK9-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8 +// CHECK9-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8 +// CHECK9-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 +// CHECK9-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 8 +// CHECK9-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[I5:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK9-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +// CHECK9-NEXT: store i64 [[DOTPREVIOUS_LB_]], i64* [[DOTPREVIOUS_LB__ADDR]], align 8 +// CHECK9-NEXT: store i64 [[DOTPREVIOUS_UB_]], i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK9-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 +// CHECK9-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 8 +// CHECK9-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* +// CHECK9-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 8 +// CHECK9-NEXT: [[TMP1:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK9-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK9-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK9-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 +// CHECK9-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK9-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK9-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK9-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK9-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK9-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] +// CHECK9-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK9: omp.precond.then: +// CHECK9-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 +// CHECK9-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK9-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_UB]], align 4 +// CHECK9-NEXT: [[TMP5:%.*]] = load i64, i64* [[DOTPREVIOUS_LB__ADDR]], align 8 +// CHECK9-NEXT: [[CONV3:%.*]] = trunc i64 [[TMP5]] to i32 +// CHECK9-NEXT: [[TMP6:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK9-NEXT: [[CONV4:%.*]] = trunc i64 [[TMP6]] to i32 +// CHECK9-NEXT: store i32 [[CONV3]], i32* [[DOTOMP_LB]], align 4 +// CHECK9-NEXT: store 
i32 [[CONV4]], i32* [[DOTOMP_UB]], align 4
+// CHECK9-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK9-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK9-NEXT:    [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK9-NEXT:    [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4
+// CHECK9-NEXT:    call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP8]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK9-NEXT:    [[TMP9:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK9-NEXT:    store i32 [[TMP9]], i32* [[DOTOMP_IV]], align 4
+// CHECK9-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK9:       omp.inner.for.cond:
+// CHECK9-NEXT:    [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK9-NEXT:    [[CONV6:%.*]] = sext i32 [[TMP10]] to i64
+// CHECK9-NEXT:    [[TMP11:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK9-NEXT:    [[CMP7:%.*]] = icmp ule i64 [[CONV6]], [[TMP11]]
+// CHECK9-NEXT:    br i1 [[CMP7]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK9:       omp.inner.for.body:
+// CHECK9-NEXT:    [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK9-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP12]], 1
+// CHECK9-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK9-NEXT:    store i32 [[ADD]], i32* [[I5]], align 4
+// CHECK9-NEXT:    [[TMP13:%.*]] = load i32, i32* [[I5]], align 4
+// CHECK9-NEXT:    [[IDXPROM:%.*]] = sext i32 [[TMP13]] to i64
+// CHECK9-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [1000 x i16], [1000 x i16]* [[TMP0]], i64 0, i64 [[IDXPROM]]
+// CHECK9-NEXT:    [[TMP14:%.*]] = load i16, i16* [[ARRAYIDX]], align 2
+// CHECK9-NEXT:    [[CONV8:%.*]] = sext i16 [[TMP14]] to i32
+// CHECK9-NEXT:    [[ADD9:%.*]] = add nsw i32 [[CONV8]], 1
+// CHECK9-NEXT:    [[CONV10:%.*]] = trunc i32 [[ADD9]] to i16
+// CHECK9-NEXT:    store i16 [[CONV10]], i16* [[ARRAYIDX]], align 2
+// CHECK9-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK9:       omp.body.continue:
+// CHECK9-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK9:       omp.inner.for.inc:
+// CHECK9-NEXT:    [[TMP15:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK9-NEXT:    [[TMP16:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK9-NEXT:    [[ADD11:%.*]] = add nsw i32 [[TMP15]], [[TMP16]]
+// CHECK9-NEXT:    store i32 [[ADD11]], i32* [[DOTOMP_IV]], align 4
+// CHECK9-NEXT:    br label [[OMP_INNER_FOR_COND]]
+// CHECK9:       omp.inner.for.end:
+// CHECK9-NEXT:    br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK9:       omp.loop.exit:
+// CHECK9-NEXT:    [[TMP17:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK9-NEXT:    [[TMP18:%.*]] = load i32, i32* [[TMP17]], align 4
+// CHECK9-NEXT:    call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP18]])
+// CHECK9-NEXT:    br label [[OMP_PRECOND_END]]
+// CHECK9:       omp.precond.end:
+// CHECK9-NEXT:    ret void
+//
+//
+// CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l54
+// CHECK9-SAME: ([10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] {
+// CHECK9-NEXT:  entry:
+// CHECK9-NEXT:    [[B_ADDR:%.*]] = alloca [10 x i32]*, align 8
+// CHECK9-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK9-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK9-NEXT:    store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK9-NEXT:    store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 8
+// CHECK9-NEXT:    [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 8
+// CHECK9-NEXT:    [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK9-NEXT:    call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0)
+// CHECK9-NEXT:    br label [[DOTEXECUTE:%.*]]
+// CHECK9:       .execute:
+// CHECK9-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]])
+// CHECK9-NEXT:    store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK9-NEXT:    call void @__omp_outlined__4(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x i32]* [[TMP0]]) #[[ATTR2]]
+// CHECK9-NEXT:    br label [[DOTOMP_DEINIT:%.*]]
+// CHECK9:       .omp.deinit:
+// CHECK9-NEXT:    call void @__kmpc_spmd_kernel_deinit_v2(i16 0)
+// CHECK9-NEXT:    br label [[DOTEXIT:%.*]]
+// CHECK9:       .exit:
+// CHECK9-NEXT:    ret void
+//
+//
+// CHECK9-LABEL: define {{[^@]+}}@__omp_outlined__4
+// CHECK9-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] {
+// CHECK9-NEXT:  entry:
+// CHECK9-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK9-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK9-NEXT:    [[B_ADDR:%.*]] = alloca [10 x i32]*, align 8
+// CHECK9-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK9-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK9-NEXT:    [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK9-NEXT:    [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK9-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK9-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK9-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK9-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [3 x i8*], align 8
+// CHECK9-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK9-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
+// CHECK9-NEXT:    store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 8
+// CHECK9-NEXT:    [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 8
+// CHECK9-NEXT:    store i32 0, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK9-NEXT:    store i32 9, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK9-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK9-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK9-NEXT:    [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK9-NEXT:    [[TMP1:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK9-NEXT:    [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4
+// CHECK9-NEXT:    call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]])
+// CHECK9-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK9-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP3]], 9
+// CHECK9-NEXT:    br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK9:       cond.true:
+// CHECK9-NEXT:    br label [[COND_END:%.*]]
+// CHECK9:       cond.false:
+// CHECK9-NEXT:    [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK9-NEXT:    br label [[COND_END]]
+// CHECK9:       cond.end:
+// CHECK9-NEXT:    [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP4]], [[COND_FALSE]] ]
+// CHECK9-NEXT:    store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK9-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK9-NEXT:    store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4
+// CHECK9-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK9:       omp.inner.for.cond:
+// CHECK9-NEXT:    [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK9-NEXT:    [[CMP1:%.*]] = icmp slt i32 [[TMP6]], 10
+// CHECK9-NEXT:    br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK9:       omp.inner.for.body:
+// CHECK9-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK9-NEXT:    [[TMP8:%.*]] = zext i32 [[TMP7]] to i64
+// CHECK9-NEXT:    [[TMP9:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK9-NEXT:    [[TMP10:%.*]] = zext i32 [[TMP9]] to i64
+// CHECK9-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0
+// CHECK9-NEXT:    [[TMP12:%.*]] = inttoptr i64 [[TMP8]] to i8*
+// CHECK9-NEXT:    store i8* [[TMP12]], i8** [[TMP11]], align 8
+// CHECK9-NEXT:    [[TMP13:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1
+// CHECK9-NEXT:    [[TMP14:%.*]] = inttoptr i64 [[TMP10]] to i8*
+// CHECK9-NEXT:    store i8* [[TMP14]], i8** [[TMP13]], align 8
+// CHECK9-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 2
+// CHECK9-NEXT:    [[TMP16:%.*]] = bitcast [10 x i32]* [[TMP0]] to i8*
+// CHECK9-NEXT:    store i8* [[TMP16]], i8** [[TMP15]], align 8
+// CHECK9-NEXT:    [[TMP17:%.*]] = bitcast [3 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK9-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i64, i64, [10 x i32]*)* @__omp_outlined__5 to i8*), i8* null, i8** [[TMP17]], i64 3)
+// CHECK9-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK9:       omp.inner.for.inc:
+// CHECK9-NEXT:    [[TMP18:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK9-NEXT:    [[TMP19:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK9-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP18]], [[TMP19]]
+// CHECK9-NEXT:    store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4
+// CHECK9-NEXT:    [[TMP20:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK9-NEXT:    [[TMP21:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK9-NEXT:    [[ADD2:%.*]] = add nsw i32 [[TMP20]], [[TMP21]]
+// CHECK9-NEXT:    store i32 [[ADD2]], i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK9-NEXT:    [[TMP22:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK9-NEXT:    [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK9-NEXT:    [[ADD3:%.*]] = add nsw i32 [[TMP22]], [[TMP23]]
+// CHECK9-NEXT:    store i32 [[ADD3]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK9-NEXT:    [[TMP24:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK9-NEXT:    [[CMP4:%.*]] = icmp sgt i32 [[TMP24]], 9
+// CHECK9-NEXT:    br i1 [[CMP4]], label [[COND_TRUE5:%.*]], label [[COND_FALSE6:%.*]]
+// CHECK9:       cond.true5:
+// CHECK9-NEXT:    br label [[COND_END7:%.*]]
+// CHECK9:       cond.false6:
+// CHECK9-NEXT:    [[TMP25:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK9-NEXT:    br label [[COND_END7]]
+// CHECK9:       cond.end7:
+// CHECK9-NEXT:    [[COND8:%.*]] = phi i32 [ 9, [[COND_TRUE5]] ], [ [[TMP25]], [[COND_FALSE6]] ]
+// CHECK9-NEXT:    store i32 [[COND8]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK9-NEXT:    [[TMP26:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK9-NEXT:    store i32 [[TMP26]], i32* [[DOTOMP_IV]], align 4
+// CHECK9-NEXT:    br label [[OMP_INNER_FOR_COND]]
+// CHECK9:       omp.inner.for.end:
+// CHECK9-NEXT:    br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK9:       omp.loop.exit:
+// CHECK9-NEXT:    call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]])
+// CHECK9-NEXT:    ret void
+//
+//
+// CHECK9-LABEL: define {{[^@]+}}@__omp_outlined__5
+// CHECK9-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] {
+// CHECK9-NEXT:  entry:
+// CHECK9-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK9-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK9-NEXT:    [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8
+// CHECK9-NEXT:    [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8
+// CHECK9-NEXT:    [[B_ADDR:%.*]] = alloca [10 x i32]*, align 8
+// CHECK9-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK9-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK9-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK9-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK9-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK9-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK9-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK9-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK9-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
+// CHECK9-NEXT:    store i64 [[DOTPREVIOUS_LB_]], i64* [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK9-NEXT:    store i64 [[DOTPREVIOUS_UB_]], i64* [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK9-NEXT:    store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 8
+// CHECK9-NEXT:    [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 8
+// CHECK9-NEXT:    store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK9-NEXT:    store i32 9, i32* [[DOTOMP_UB]], align 4
+// CHECK9-NEXT:    [[TMP1:%.*]] = load i64, i64* [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK9-NEXT:    [[CONV:%.*]] = trunc i64 [[TMP1]] to i32
+// CHECK9-NEXT:    [[TMP2:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK9-NEXT:    [[CONV1:%.*]] = trunc i64 [[TMP2]] to i32
+// CHECK9-NEXT:    store i32 [[CONV]], i32* [[DOTOMP_LB]], align 4
+// CHECK9-NEXT:    store i32 [[CONV1]], i32* [[DOTOMP_UB]], align 4
+// CHECK9-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK9-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK9-NEXT:    [[TMP3:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK9-NEXT:    [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4
+// CHECK9-NEXT:    call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP4]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK9-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK9-NEXT:    store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4
+// CHECK9-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK9:       omp.inner.for.cond:
+// CHECK9-NEXT:    [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK9-NEXT:    [[CONV2:%.*]] = sext i32 [[TMP6]] to i64
+// CHECK9-NEXT:    [[TMP7:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK9-NEXT:    [[CMP:%.*]] = icmp ule i64 [[CONV2]], [[TMP7]]
+// CHECK9-NEXT:    br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK9:       omp.inner.for.body:
+// CHECK9-NEXT:    [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK9-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP8]], 1
+// CHECK9-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK9-NEXT:    store i32 [[ADD]], i32* [[I]], align 4
+// CHECK9-NEXT:    [[TMP9:%.*]] = load i32, i32* [[I]], align 4
+// CHECK9-NEXT:    [[IDXPROM:%.*]] = sext i32 [[TMP9]] to i64
+// CHECK9-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP0]], i64 0, i64 [[IDXPROM]]
+// CHECK9-NEXT:    [[TMP10:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+// CHECK9-NEXT:    [[ADD3:%.*]] = add nsw i32 [[TMP10]], 1
+// CHECK9-NEXT:    store i32 [[ADD3]], i32* [[ARRAYIDX]], align 4
+// CHECK9-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK9:       omp.body.continue:
+// CHECK9-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK9:       omp.inner.for.inc:
+// CHECK9-NEXT:    [[TMP11:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK9-NEXT:    [[TMP12:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK9-NEXT:    [[ADD4:%.*]] = add nsw i32 [[TMP11]], [[TMP12]]
+// CHECK9-NEXT:    store i32 [[ADD4]], i32* [[DOTOMP_IV]], align 4
+// CHECK9-NEXT:    br label [[OMP_INNER_FOR_COND]]
+// CHECK9:       omp.inner.for.end:
+// CHECK9-NEXT:    br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK9:       omp.loop.exit:
+// CHECK9-NEXT:    call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP4]])
+// CHECK9-NEXT:    ret void
+//
+//
+// CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l59
+// CHECK9-SAME: ([10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i64 [[F:%.*]]) #[[ATTR0]] {
+// CHECK9-NEXT:  entry:
+// CHECK9-NEXT:    [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 8
+// CHECK9-NEXT:    [[F_ADDR:%.*]] = alloca i64, align 8
+// CHECK9-NEXT:    [[F_CASTED:%.*]] = alloca i64, align 8
+// CHECK9-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK9-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK9-NEXT:    store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK9-NEXT:    store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 8
+// CHECK9-NEXT:    store i64 [[F]], i64* [[F_ADDR]], align 8
+// CHECK9-NEXT:    [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 8
+// CHECK9-NEXT:    [[CONV:%.*]] = bitcast i64* [[F_ADDR]] to i32*
+// CHECK9-NEXT:    [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK9-NEXT:    call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0)
+// CHECK9-NEXT:    br label [[DOTEXECUTE:%.*]]
+// CHECK9:       .execute:
+// CHECK9-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]])
+// CHECK9-NEXT:    [[TMP2:%.*]] = load i32, i32* [[CONV]], align 8
+// CHECK9-NEXT:    [[CONV1:%.*]] = bitcast i64* [[F_CASTED]] to i32*
+// CHECK9-NEXT:    store i32 [[TMP2]], i32* [[CONV1]], align 4
+// CHECK9-NEXT:    [[TMP3:%.*]] = load i64, i64* [[F_CASTED]], align 8
+// CHECK9-NEXT:    store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK9-NEXT:    call void @__omp_outlined__6(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x [10 x i32]]* [[TMP0]], i64 [[TMP3]]) #[[ATTR2]]
+// CHECK9-NEXT:    br label [[DOTOMP_DEINIT:%.*]]
+// CHECK9:       .omp.deinit:
+// CHECK9-NEXT:    call void @__kmpc_spmd_kernel_deinit_v2(i16 0)
+// CHECK9-NEXT:    br label [[DOTEXIT:%.*]]
+// CHECK9:       .exit:
+// CHECK9-NEXT:    ret void
+//
+//
+// CHECK9-LABEL: define {{[^@]+}}@__omp_outlined__6
+// CHECK9-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i64 [[F:%.*]]) #[[ATTR0]] {
+// CHECK9-NEXT:  entry:
+// CHECK9-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK9-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK9-NEXT:    [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 8
+// CHECK9-NEXT:    [[F_ADDR:%.*]] = alloca i64, align 8
+// CHECK9-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK9-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK9-NEXT:    [[_TMP1:%.*]] = alloca i32, align 4
+// CHECK9-NEXT:    [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK9-NEXT:    [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK9-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK9-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK9-NEXT:    [[K:%.*]] = alloca i32, align 4
+// CHECK9-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK9-NEXT:    [[J:%.*]] = alloca i32, align 4
+// CHECK9-NEXT:    [[F_CASTED:%.*]] = alloca i64, align 8
+// CHECK9-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x i8*], align 8
+// CHECK9-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK9-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
+// CHECK9-NEXT:    store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 8
+// CHECK9-NEXT:    store i64 [[F]], i64* [[F_ADDR]], align 8
+// CHECK9-NEXT:    [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 8
+// CHECK9-NEXT:    [[CONV:%.*]] = bitcast i64* [[F_ADDR]] to i32*
+// CHECK9-NEXT:    store i32 0, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK9-NEXT:    store i32 99, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK9-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK9-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK9-NEXT:    [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK9-NEXT:    [[TMP1:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK9-NEXT:    [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4
+// CHECK9-NEXT:    call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]])
+// CHECK9-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK9-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP3]], 99
+// CHECK9-NEXT:    br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK9:       cond.true:
+// CHECK9-NEXT:    br label [[COND_END:%.*]]
+// CHECK9:       cond.false:
+// CHECK9-NEXT:    [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK9-NEXT:    br label [[COND_END]]
+// CHECK9:       cond.end:
+// CHECK9-NEXT:    [[COND:%.*]] = phi i32 [ 99, [[COND_TRUE]] ], [ [[TMP4]], [[COND_FALSE]] ]
+// CHECK9-NEXT:    store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK9-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK9-NEXT:    store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4
+// CHECK9-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK9:       omp.inner.for.cond:
+// CHECK9-NEXT:    [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK9-NEXT:    [[CMP2:%.*]] = icmp slt i32 [[TMP6]], 100
+// CHECK9-NEXT:    br i1 [[CMP2]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK9:       omp.inner.for.body:
+// CHECK9-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK9-NEXT:    [[TMP8:%.*]] = zext i32 [[TMP7]] to i64
+// CHECK9-NEXT:    [[TMP9:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK9-NEXT:    [[TMP10:%.*]] = zext i32 [[TMP9]] to i64
+// CHECK9-NEXT:    [[TMP11:%.*]] = load i32, i32* [[CONV]], align 8
+// CHECK9-NEXT:    [[CONV3:%.*]] = bitcast i64* [[F_CASTED]] to i32*
+// CHECK9-NEXT:    store i32 [[TMP11]], i32* [[CONV3]], align 4
+// CHECK9-NEXT:    [[TMP12:%.*]] = load i64, i64* [[F_CASTED]], align 8
+// CHECK9-NEXT:    [[TMP13:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0
+// CHECK9-NEXT:    [[TMP14:%.*]] = inttoptr i64 [[TMP8]] to i8*
+// CHECK9-NEXT:    store i8* [[TMP14]], i8** [[TMP13]], align 8
+// CHECK9-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1
+// CHECK9-NEXT:    [[TMP16:%.*]] = inttoptr i64 [[TMP10]] to i8*
+// CHECK9-NEXT:    store i8* [[TMP16]], i8** [[TMP15]], align 8
+// CHECK9-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 2
+// CHECK9-NEXT:    [[TMP18:%.*]] = bitcast [10 x [10 x i32]]* [[TMP0]] to i8*
+// CHECK9-NEXT:    store i8* [[TMP18]], i8** [[TMP17]], align 8
+// CHECK9-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 3
+// CHECK9-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP12]] to i8*
+// CHECK9-NEXT:    store i8* [[TMP20]], i8** [[TMP19]], align 8
+// CHECK9-NEXT:    [[TMP21:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK9-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i64, i64, [10 x [10 x i32]]*, i64)* @__omp_outlined__7 to i8*), i8* null, i8** [[TMP21]], i64 4)
+// CHECK9-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK9:       omp.inner.for.inc:
+// CHECK9-NEXT:    [[TMP22:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK9-NEXT:    [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK9-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP22]], [[TMP23]]
+// CHECK9-NEXT:    store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4
+// CHECK9-NEXT:    [[TMP24:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK9-NEXT:    [[TMP25:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK9-NEXT:    [[ADD4:%.*]] = add nsw i32 [[TMP24]], [[TMP25]]
+// CHECK9-NEXT:    store i32 [[ADD4]], i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK9-NEXT:    [[TMP26:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK9-NEXT:    [[TMP27:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK9-NEXT:    [[ADD5:%.*]] = add nsw i32 [[TMP26]], [[TMP27]]
+// CHECK9-NEXT:    store i32 [[ADD5]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK9-NEXT:    [[TMP28:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK9-NEXT:    [[CMP6:%.*]] = icmp sgt i32 [[TMP28]], 99
+// CHECK9-NEXT:    br i1 [[CMP6]], label [[COND_TRUE7:%.*]], label [[COND_FALSE8:%.*]]
+// CHECK9:       cond.true7:
+// CHECK9-NEXT:    br label [[COND_END9:%.*]]
+// CHECK9:       cond.false8:
+// CHECK9-NEXT:    [[TMP29:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK9-NEXT:    br label [[COND_END9]]
+// CHECK9:       cond.end9:
+// CHECK9-NEXT:    [[COND10:%.*]] = phi i32 [ 99, [[COND_TRUE7]] ], [ [[TMP29]], [[COND_FALSE8]] ]
+// CHECK9-NEXT:    store i32 [[COND10]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK9-NEXT:    [[TMP30:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK9-NEXT:    store i32 [[TMP30]], i32* [[DOTOMP_IV]], align 4
+// CHECK9-NEXT:    br label [[OMP_INNER_FOR_COND]]
+// CHECK9:       omp.inner.for.end:
+// CHECK9-NEXT:    br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK9:       omp.loop.exit:
+// CHECK9-NEXT:    call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]])
+// CHECK9-NEXT:    ret void
+//
+//
+// CHECK9-LABEL: define {{[^@]+}}@__omp_outlined__7
+// CHECK9-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i64 [[F:%.*]]) #[[ATTR0]] {
+// CHECK9-NEXT:  entry:
+// CHECK9-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK9-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK9-NEXT:    [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8
+// CHECK9-NEXT:    [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8
+// CHECK9-NEXT:    [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 8
+// CHECK9-NEXT:    [[F_ADDR:%.*]] = alloca i64, align 8
+// CHECK9-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK9-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK9-NEXT:    [[_TMP1:%.*]] = alloca i32, align 4
+// CHECK9-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK9-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK9-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK9-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK9-NEXT:    [[K:%.*]] = alloca i32, align 4
+// CHECK9-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK9-NEXT:    [[J:%.*]] = alloca i32, align 4
+// CHECK9-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK9-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
+// CHECK9-NEXT:    store i64 [[DOTPREVIOUS_LB_]], i64* [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK9-NEXT:    store i64 [[DOTPREVIOUS_UB_]], i64* [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK9-NEXT:    store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 8
+// CHECK9-NEXT:    store i64 [[F]], i64* [[F_ADDR]], align 8
+// CHECK9-NEXT:    [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 8
+// CHECK9-NEXT:    [[CONV:%.*]] = bitcast i64* [[F_ADDR]] to i32*
+// CHECK9-NEXT:    store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK9-NEXT:    store i32 99, i32* [[DOTOMP_UB]], align 4
+// CHECK9-NEXT:    [[TMP1:%.*]] = load i64, i64* [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK9-NEXT:    [[CONV2:%.*]] = trunc i64 [[TMP1]] to i32
+// CHECK9-NEXT:    [[TMP2:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK9-NEXT:    [[CONV3:%.*]] = trunc i64 [[TMP2]] to i32
+// CHECK9-NEXT:    store i32 [[CONV2]], i32* [[DOTOMP_LB]], align 4
+// CHECK9-NEXT:    store i32 [[CONV3]], i32* [[DOTOMP_UB]], align 4
+// CHECK9-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK9-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK9-NEXT:    [[TMP3:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK9-NEXT:    [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4
+// CHECK9-NEXT:    call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP4]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK9-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK9-NEXT:    store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4
+// CHECK9-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK9:       omp.inner.for.cond:
+// CHECK9-NEXT:    [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK9-NEXT:    [[CONV4:%.*]] = sext i32 [[TMP6]] to i64
+// CHECK9-NEXT:    [[TMP7:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK9-NEXT:    [[CMP:%.*]] = icmp ule i64 [[CONV4]], [[TMP7]]
+// CHECK9-NEXT:    br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK9:       omp.inner.for.body:
+// CHECK9-NEXT:    [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK9-NEXT:    [[DIV:%.*]] = sdiv i32 [[TMP8]], 10
+// CHECK9-NEXT:    [[MUL:%.*]] = mul nsw i32 [[DIV]], 1
+// CHECK9-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK9-NEXT:    store i32 [[ADD]], i32* [[I]], align 4
+// CHECK9-NEXT:    [[TMP9:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK9-NEXT:    [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK9-NEXT:    [[DIV5:%.*]] = sdiv i32 [[TMP10]], 10
+// CHECK9-NEXT:    [[MUL6:%.*]] = mul nsw i32 [[DIV5]], 10
+// CHECK9-NEXT:    [[SUB:%.*]] = sub nsw i32 [[TMP9]], [[MUL6]]
+// CHECK9-NEXT:    [[MUL7:%.*]] = mul nsw i32 [[SUB]], 1
+// CHECK9-NEXT:    [[ADD8:%.*]] = add nsw i32 0, [[MUL7]]
+// CHECK9-NEXT:    store i32 [[ADD8]], i32* [[J]], align 4
+// CHECK9-NEXT:    store i32 10, i32* [[K]], align 4
+// CHECK9-NEXT:    [[TMP11:%.*]] = load i32, i32* [[I]], align 4
+// CHECK9-NEXT:    [[TMP12:%.*]] = load i32, i32* [[J]], align 4
+// CHECK9-NEXT:    [[TMP13:%.*]] = load i32, i32* [[CONV]], align 8
+// CHECK9-NEXT:    [[MUL9:%.*]] = mul nsw i32 [[TMP12]], [[TMP13]]
+// CHECK9-NEXT:    [[ADD10:%.*]] = add nsw i32 [[TMP11]], [[MUL9]]
+// CHECK9-NEXT:    [[TMP14:%.*]] = load i32, i32* [[K]], align 4
+// CHECK9-NEXT:    [[ADD11:%.*]] = add nsw i32 [[ADD10]], [[TMP14]]
+// CHECK9-NEXT:    [[TMP15:%.*]] = load i32, i32* [[I]], align 4
+// CHECK9-NEXT:    [[IDXPROM:%.*]] = sext i32 [[TMP15]] to i64
+// CHECK9-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* [[TMP0]], i64 0, i64 [[IDXPROM]]
+// CHECK9-NEXT:    [[TMP16:%.*]] = load i32, i32* [[J]], align 4
+// CHECK9-NEXT:    [[IDXPROM12:%.*]] = sext i32 [[TMP16]] to i64
+// CHECK9-NEXT:    [[ARRAYIDX13:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[ARRAYIDX]], i64 0, i64 [[IDXPROM12]]
+// CHECK9-NEXT:    store i32 [[ADD11]], i32* [[ARRAYIDX13]], align 4
+// CHECK9-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK9:       omp.body.continue:
+// CHECK9-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK9:       omp.inner.for.inc:
+// CHECK9-NEXT:    [[TMP17:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK9-NEXT:    [[TMP18:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK9-NEXT:    [[ADD14:%.*]] = add nsw i32 [[TMP17]], [[TMP18]]
+// CHECK9-NEXT:    store i32 [[ADD14]], i32* [[DOTOMP_IV]], align 4
+// CHECK9-NEXT:    br label [[OMP_INNER_FOR_COND]]
+// CHECK9:       omp.inner.for.end:
+// CHECK9-NEXT:    br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK9:       omp.loop.exit:
+// CHECK9-NEXT:    call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP4]])
+// CHECK9-NEXT:    ret void
+//
+//
+// CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l67
+// CHECK9-SAME: (i64 [[N:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]]) #[[ATTR0]] {
+// CHECK9-NEXT:  entry:
+// CHECK9-NEXT:    [[N_ADDR:%.*]] = alloca i64, align 8
+// CHECK9-NEXT:    [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 8
+// CHECK9-NEXT:    [[N_CASTED:%.*]] = alloca i64, align 8
+// CHECK9-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK9-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK9-NEXT:    store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK9-NEXT:    store i64 [[N]], i64* [[N_ADDR]], align 8
+// CHECK9-NEXT:    store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 8
+// CHECK9-NEXT:    [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32*
+// CHECK9-NEXT:    [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 8
+// CHECK9-NEXT:    [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK9-NEXT:    call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0)
+// CHECK9-NEXT:    br label [[DOTEXECUTE:%.*]]
+// CHECK9:       .execute:
+// CHECK9-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]])
+// CHECK9-NEXT:    [[TMP2:%.*]] = load i32, i32* [[CONV]], align 8
+// CHECK9-NEXT:    [[CONV1:%.*]] = bitcast i64* [[N_CASTED]] to i32*
+// CHECK9-NEXT:    store i32 [[TMP2]], i32* [[CONV1]], align 4
+// CHECK9-NEXT:    [[TMP3:%.*]] = load i64, i64* [[N_CASTED]], align 8
+// CHECK9-NEXT:    store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK9-NEXT:    call void @__omp_outlined__8(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i64 [[TMP3]], [10 x [10 x i32]]* [[TMP0]]) #[[ATTR2]]
+// CHECK9-NEXT:    br label [[DOTOMP_DEINIT:%.*]]
+// CHECK9:       .omp.deinit:
+// CHECK9-NEXT:    call void @__kmpc_spmd_kernel_deinit_v2(i16 0)
+// CHECK9-NEXT:    br label [[DOTEXIT:%.*]]
+// CHECK9:       .exit:
+// CHECK9-NEXT:    ret void
+//
+//
+// CHECK9-LABEL: define {{[^@]+}}@__omp_outlined__8
+// CHECK9-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[N:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]]) #[[ATTR0]] {
+// CHECK9-NEXT:  entry:
+// CHECK9-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK9-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK9-NEXT:    [[N_ADDR:%.*]] = alloca i64, align 8
+// CHECK9-NEXT:    [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 8
+// CHECK9-NEXT:    [[DOTOMP_IV:%.*]] = alloca i64, align 8
+// CHECK9-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK9-NEXT:    [[_TMP1:%.*]] = alloca i32, align 4
+// CHECK9-NEXT:    [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK9-NEXT:    [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4
+// CHECK9-NEXT:    [[DOTCAPTURE_EXPR_3:%.*]] = alloca i64, align 8
+// CHECK9-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK9-NEXT:    [[J:%.*]] = alloca i32, align 4
+// CHECK9-NEXT:    [[DOTOMP_COMB_LB:%.*]] = alloca i64, align 8
+// CHECK9-NEXT:    [[DOTOMP_COMB_UB:%.*]] = alloca i64, align 8
+// CHECK9-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i64, align 8
+// CHECK9-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK9-NEXT:    [[I10:%.*]] = alloca i32, align 4
+// CHECK9-NEXT:    [[J11:%.*]] = alloca i32, align 4
+// CHECK9-NEXT:    [[N_CASTED:%.*]] = alloca i64, align 8
+// CHECK9-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x i8*], align 8
+// CHECK9-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK9-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
+// CHECK9-NEXT:    store i64 [[N]], i64* [[N_ADDR]], align 8
+// CHECK9-NEXT:    store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 8
+// CHECK9-NEXT:    [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32*
+// CHECK9-NEXT:    [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 8
+// CHECK9-NEXT:    [[TMP1:%.*]] = load i32, i32* [[CONV]], align 8
+// CHECK9-NEXT:    store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK9-NEXT:    [[TMP2:%.*]] = load i32, i32* [[CONV]], align 8
+// CHECK9-NEXT:    store i32 [[TMP2]], i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK9-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK9-NEXT:    [[SUB:%.*]] = sub nsw i32 [[TMP3]], 0
+// CHECK9-NEXT:    [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK9-NEXT:    [[CONV4:%.*]] = sext i32 [[DIV]] to i64
+// CHECK9-NEXT:    [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK9-NEXT:    [[SUB5:%.*]] = sub nsw i32 [[TMP4]], 0
+// CHECK9-NEXT:    [[DIV6:%.*]] = sdiv i32 [[SUB5]], 1
+// CHECK9-NEXT:    [[CONV7:%.*]] = sext i32 [[DIV6]] to i64
+// CHECK9-NEXT:    [[MUL:%.*]] = mul nsw i64 [[CONV4]], [[CONV7]]
+// CHECK9-NEXT:    [[SUB8:%.*]] = sub nsw i64 [[MUL]], 1
+// CHECK9-NEXT:    store i64 [[SUB8]], i64* [[DOTCAPTURE_EXPR_3]], align 8
+// CHECK9-NEXT:    store i32 0, i32* [[I]], align 4
+// CHECK9-NEXT:    store i32 0, i32* [[J]], align 4
+// CHECK9-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK9-NEXT:    [[CMP:%.*]] = icmp slt i32 0, [[TMP5]]
+// CHECK9-NEXT:    br i1 [[CMP]], label [[LAND_LHS_TRUE:%.*]], label [[OMP_PRECOND_END:%.*]]
+// CHECK9:       land.lhs.true:
+// CHECK9-NEXT:    [[TMP6:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK9-NEXT:    [[CMP9:%.*]] = icmp slt i32 0, [[TMP6]]
+// CHECK9-NEXT:    br i1 [[CMP9]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END]]
+// CHECK9:       omp.precond.then:
+// CHECK9-NEXT:    store i64 0, i64* [[DOTOMP_COMB_LB]], align 8
+// CHECK9-NEXT:    [[TMP7:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8
+// CHECK9-NEXT:    store i64 [[TMP7]], i64* [[DOTOMP_COMB_UB]], align 8
+// CHECK9-NEXT:    store i64 1, i64* [[DOTOMP_STRIDE]], align 8
+// CHECK9-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK9-NEXT:    [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK9-NEXT:    [[CONV12:%.*]] = zext i32 [[NVPTX_NUM_THREADS]] to i64
+// CHECK9-NEXT:    [[TMP8:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK9-NEXT:    [[TMP9:%.*]] = load i32, i32* [[TMP8]], align 4
+// CHECK9-NEXT:    call void @__kmpc_for_static_init_8(%struct.ident_t* @[[GLOB1]], i32 [[TMP9]], i32 91, i32* [[DOTOMP_IS_LAST]], i64* [[DOTOMP_COMB_LB]], i64* [[DOTOMP_COMB_UB]], i64* [[DOTOMP_STRIDE]], i64 1, i64 [[CONV12]])
+// CHECK9-NEXT:    [[TMP10:%.*]] = load i64, i64* [[DOTOMP_COMB_UB]], align 8
+// CHECK9-NEXT:    [[TMP11:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8
+// CHECK9-NEXT:    [[CMP13:%.*]] = icmp sgt i64 [[TMP10]], [[TMP11]]
+// CHECK9-NEXT:    br i1 [[CMP13]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK9:       cond.true:
+// CHECK9-NEXT:    [[TMP12:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8
+// CHECK9-NEXT:    br label [[COND_END:%.*]]
+// CHECK9:       cond.false:
+// CHECK9-NEXT:    [[TMP13:%.*]] = load i64, i64* [[DOTOMP_COMB_UB]], align 8
+// CHECK9-NEXT:    br label [[COND_END]]
+// CHECK9:       cond.end:
+// CHECK9-NEXT:    [[COND:%.*]] = phi i64 [ [[TMP12]], [[COND_TRUE]] ], [ [[TMP13]], [[COND_FALSE]] ]
+// CHECK9-NEXT:    store i64 [[COND]], i64* [[DOTOMP_COMB_UB]], align 8
+// CHECK9-NEXT:    [[TMP14:%.*]] = load i64, i64* [[DOTOMP_COMB_LB]], align 8
+// CHECK9-NEXT:    store i64 [[TMP14]], i64* [[DOTOMP_IV]], align 8
+// CHECK9-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK9:       omp.inner.for.cond:
+// CHECK9-NEXT:    [[TMP15:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8
+// CHECK9-NEXT:    [[TMP16:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8
+// CHECK9-NEXT:    [[ADD:%.*]] = add nsw i64 [[TMP16]], 1
+// CHECK9-NEXT:    [[CMP14:%.*]] = icmp slt i64 [[TMP15]], [[ADD]]
+// CHECK9-NEXT:    br i1 [[CMP14]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK9:       omp.inner.for.body:
+// CHECK9-NEXT:    [[TMP17:%.*]] = load i64, i64* [[DOTOMP_COMB_LB]], align 8
+// CHECK9-NEXT:    [[TMP18:%.*]] = load i64, i64* [[DOTOMP_COMB_UB]], align 8
+// CHECK9-NEXT:    [[TMP19:%.*]] = load i32, i32* [[CONV]], align 8
+// CHECK9-NEXT:    [[CONV15:%.*]] = bitcast i64* [[N_CASTED]] to i32*
+// CHECK9-NEXT:    store i32 [[TMP19]], i32* [[CONV15]], align 4
+// CHECK9-NEXT:    [[TMP20:%.*]] = load i64, i64* [[N_CASTED]], align 8
+// CHECK9-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0
+// CHECK9-NEXT:    [[TMP22:%.*]] = inttoptr i64 [[TMP17]] to i8*
+// CHECK9-NEXT:    store i8* [[TMP22]], i8** [[TMP21]], align 8
+// CHECK9-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1
+// CHECK9-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP18]] to i8*
+// CHECK9-NEXT:    store i8* [[TMP24]], i8** [[TMP23]], align 8
+// CHECK9-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 2
+// CHECK9-NEXT:    [[TMP26:%.*]] = inttoptr i64 [[TMP20]] to i8*
+// CHECK9-NEXT:    store i8* [[TMP26]], i8** [[TMP25]], align 8
+// CHECK9-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 3
+// CHECK9-NEXT:    [[TMP28:%.*]] = bitcast [10 x [10 x i32]]* [[TMP0]] to i8*
+// CHECK9-NEXT:    store i8* [[TMP28]], i8** [[TMP27]], align 8
+// CHECK9-NEXT:    [[TMP29:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK9-NEXT:    [[TMP30:%.*]] = load i32, i32* [[TMP29]], align 4
+// CHECK9-NEXT:    [[TMP31:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK9-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP30]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i64, i64, i64, [10 x [10 x i32]]*)* @__omp_outlined__9 to i8*), i8* null, i8** [[TMP31]], i64 4)
+// CHECK9-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK9:       omp.inner.for.inc:
+// CHECK9-NEXT:    [[TMP32:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8
+// CHECK9-NEXT:    [[TMP33:%.*]] = load i64, i64* [[DOTOMP_STRIDE]], align 8
+// CHECK9-NEXT:    [[ADD16:%.*]] = add nsw i64 [[TMP32]], [[TMP33]]
+// CHECK9-NEXT:    store i64 [[ADD16]], i64* [[DOTOMP_IV]], align 8
+// CHECK9-NEXT:    [[TMP34:%.*]] = load i64, i64* [[DOTOMP_COMB_LB]], align 8
+// CHECK9-NEXT:    [[TMP35:%.*]] = load i64, i64* [[DOTOMP_STRIDE]], align 8
+// CHECK9-NEXT:    [[ADD17:%.*]] = add nsw i64 [[TMP34]], [[TMP35]]
+// CHECK9-NEXT:    store i64 [[ADD17]], i64* [[DOTOMP_COMB_LB]], align 8
+// CHECK9-NEXT:    [[TMP36:%.*]] = load i64, i64* [[DOTOMP_COMB_UB]], align 8
+// CHECK9-NEXT:    [[TMP37:%.*]] = load i64, i64* [[DOTOMP_STRIDE]], align 8
+// CHECK9-NEXT:    [[ADD18:%.*]] = add nsw i64 [[TMP36]], [[TMP37]]
+// CHECK9-NEXT:    store i64 [[ADD18]], i64* [[DOTOMP_COMB_UB]], align 8
+// CHECK9-NEXT:    [[TMP38:%.*]] = load i64, i64* [[DOTOMP_COMB_UB]], align 8
+// CHECK9-NEXT:    [[TMP39:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8
+// CHECK9-NEXT:    [[CMP19:%.*]] = icmp sgt i64 [[TMP38]], [[TMP39]]
+// CHECK9-NEXT:    br i1 [[CMP19]], label [[COND_TRUE20:%.*]], label [[COND_FALSE21:%.*]]
+// CHECK9:       cond.true20:
+// CHECK9-NEXT:    [[TMP40:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8
+// CHECK9-NEXT:    br label [[COND_END22:%.*]]
+// CHECK9:       cond.false21:
+// CHECK9-NEXT:    [[TMP41:%.*]] = load i64, i64* [[DOTOMP_COMB_UB]], align 8
+// CHECK9-NEXT:    br label [[COND_END22]]
+// CHECK9:       cond.end22:
+// CHECK9-NEXT:    [[COND23:%.*]] = phi i64 [ [[TMP40]], [[COND_TRUE20]] ], [ [[TMP41]], [[COND_FALSE21]] ]
+// CHECK9-NEXT:    store i64 [[COND23]], i64* [[DOTOMP_COMB_UB]], align 8
+// CHECK9-NEXT:    [[TMP42:%.*]] = load i64, i64* [[DOTOMP_COMB_LB]], align 8
+// CHECK9-NEXT:    store i64 [[TMP42]], i64* [[DOTOMP_IV]], align 8
+// CHECK9-NEXT:    br label [[OMP_INNER_FOR_COND]]
+// CHECK9:       omp.inner.for.end:
+// CHECK9-NEXT:    br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK9:       omp.loop.exit:
+// CHECK9-NEXT:    [[TMP43:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK9-NEXT:    [[TMP44:%.*]] = load i32, i32* [[TMP43]], align 4
+// CHECK9-NEXT:    call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP44]])
+// CHECK9-NEXT:    br label [[OMP_PRECOND_END]]
+// CHECK9:       omp.precond.end:
+// CHECK9-NEXT:    ret void
+//
+//
+// CHECK9-LABEL: define {{[^@]+}}@__omp_outlined__9
+// CHECK9-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], i64 [[N:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]]) #[[ATTR0]] {
+// CHECK9-NEXT:  entry:
+// CHECK9-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK9-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK9-NEXT:    [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8
+// CHECK9-NEXT:    [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8
+// CHECK9-NEXT:    [[N_ADDR:%.*]] = alloca i64, align 8
+// CHECK9-NEXT:    [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 8
+// CHECK9-NEXT:    [[DOTOMP_IV:%.*]] = alloca i64, align 8
+// CHECK9-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK9-NEXT:    [[_TMP1:%.*]] = alloca i32, align 4
+// CHECK9-NEXT:    [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK9-NEXT:    [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4
+// CHECK9-NEXT:    [[DOTCAPTURE_EXPR_3:%.*]] = alloca i64, align 8
+// CHECK9-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK9-NEXT:    [[J:%.*]] = alloca i32, align 4
+// CHECK9-NEXT:    [[DOTOMP_LB:%.*]] = alloca i64, align 8
+// CHECK9-NEXT:    [[DOTOMP_UB:%.*]] = alloca i64, align 8
+// CHECK9-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i64, align 8
+// CHECK9-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK9-NEXT:    [[I10:%.*]] = alloca i32, align 4
+// CHECK9-NEXT:    [[J11:%.*]] = alloca i32, align 4
+// CHECK9-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK9-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
+// CHECK9-NEXT:    store i64 [[DOTPREVIOUS_LB_]], i64* [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK9-NEXT:    store i64 [[DOTPREVIOUS_UB_]], i64* [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK9-NEXT:    store i64 [[N]], i64* [[N_ADDR]], align 8
+// CHECK9-NEXT:    store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 8
+// CHECK9-NEXT:    [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32*
+// CHECK9-NEXT:    [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 8
+// CHECK9-NEXT:    [[TMP1:%.*]] = load i32, i32* [[CONV]], align 8
+// CHECK9-NEXT:    store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK9-NEXT:    [[TMP2:%.*]] = load i32, i32* [[CONV]], align 8
+// CHECK9-NEXT:    store i32 [[TMP2]], i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK9-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK9-NEXT:    [[SUB:%.*]] = sub nsw i32 [[TMP3]], 0
+// CHECK9-NEXT:    [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK9-NEXT:    [[CONV4:%.*]] = sext i32 [[DIV]] to i64
+// CHECK9-NEXT:    [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK9-NEXT:    [[SUB5:%.*]] = sub nsw i32 [[TMP4]], 0
+// CHECK9-NEXT:    [[DIV6:%.*]] = sdiv i32 [[SUB5]], 1
+// CHECK9-NEXT:    [[CONV7:%.*]] = sext i32 [[DIV6]] to i64
+// CHECK9-NEXT:    [[MUL:%.*]] = mul nsw i64 [[CONV4]], [[CONV7]]
+// CHECK9-NEXT:    [[SUB8:%.*]] = sub nsw i64 [[MUL]], 1
+// CHECK9-NEXT:    store i64 [[SUB8]], i64* [[DOTCAPTURE_EXPR_3]], align 8
+// CHECK9-NEXT:    store i32 0, i32* [[I]], align 4
+// CHECK9-NEXT:    store i32 0, i32* [[J]], align 4
+// CHECK9-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK9-NEXT:    [[CMP:%.*]] = icmp slt i32 0, [[TMP5]]
+// CHECK9-NEXT:    br i1 [[CMP]], label [[LAND_LHS_TRUE:%.*]], label [[OMP_PRECOND_END:%.*]]
+// CHECK9:       land.lhs.true:
+// CHECK9-NEXT:    [[TMP6:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK9-NEXT:    [[CMP9:%.*]] = icmp slt i32 0, [[TMP6]]
+// CHECK9-NEXT:    br i1 [[CMP9]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END]]
+// CHECK9:       omp.precond.then:
+// CHECK9-NEXT:    store i64 0, i64* [[DOTOMP_LB]], align 8
+// CHECK9-NEXT:    [[TMP7:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8
+// CHECK9-NEXT:    store i64 [[TMP7]], i64* [[DOTOMP_UB]], align 8
+// CHECK9-NEXT:    [[TMP8:%.*]] = load i64, i64* [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK9-NEXT:    [[TMP9:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK9-NEXT:    store i64 [[TMP8]], i64* [[DOTOMP_LB]], align 8
+// CHECK9-NEXT:    store i64 [[TMP9]], i64* [[DOTOMP_UB]], align 8
+// CHECK9-NEXT:    store i64 1, i64* [[DOTOMP_STRIDE]], align 8
+// CHECK9-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK9-NEXT:    [[TMP10:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK9-NEXT:    [[TMP11:%.*]] = load i32, i32* [[TMP10]], align 4
+// CHECK9-NEXT:    call void @__kmpc_for_static_init_8(%struct.ident_t* @[[GLOB2]], i32 [[TMP11]], i32 33, i32* [[DOTOMP_IS_LAST]], i64* [[DOTOMP_LB]], i64* [[DOTOMP_UB]], i64* [[DOTOMP_STRIDE]], i64 1, i64 1)
+// CHECK9-NEXT:    [[TMP12:%.*]] = load i64, i64* [[DOTOMP_LB]], align 8
+// CHECK9-NEXT:    store i64 [[TMP12]], i64* [[DOTOMP_IV]], align 8
+// CHECK9-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK9:       omp.inner.for.cond:
+// CHECK9-NEXT:    [[TMP13:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8
+// CHECK9-NEXT:    [[TMP14:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK9-NEXT:    [[CMP12:%.*]] = icmp ule i64 [[TMP13]], [[TMP14]]
+// CHECK9-NEXT:    br i1 [[CMP12]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK9:       omp.inner.for.body:
+// CHECK9-NEXT:    [[TMP15:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8
+// CHECK9-NEXT:    [[TMP16:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK9-NEXT:    [[SUB13:%.*]] = sub nsw i32 [[TMP16]], 0
+// CHECK9-NEXT:    [[DIV14:%.*]] = sdiv i32 [[SUB13]], 1
+// CHECK9-NEXT:    [[MUL15:%.*]] = mul nsw i32 1, [[DIV14]]
+// CHECK9-NEXT:    [[CONV16:%.*]] = sext i32 [[MUL15]] to i64
+// CHECK9-NEXT:    [[DIV17:%.*]] = sdiv i64 [[TMP15]], [[CONV16]]
+// CHECK9-NEXT:    [[MUL18:%.*]] = mul nsw i64 [[DIV17]], 1
+// CHECK9-NEXT:    [[ADD:%.*]] = add nsw i64 0, [[MUL18]]
+// CHECK9-NEXT:    [[CONV19:%.*]] = trunc i64 [[ADD]] to i32
+// CHECK9-NEXT:    store i32 [[CONV19]], i32* [[I10]], align 4
+// CHECK9-NEXT:    [[TMP17:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8
+// CHECK9-NEXT:    [[TMP18:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8
+// CHECK9-NEXT:    [[TMP19:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK9-NEXT:    [[SUB20:%.*]] = sub nsw i32 [[TMP19]], 0
+// CHECK9-NEXT:    [[DIV21:%.*]] = sdiv i32 [[SUB20]], 1
+// CHECK9-NEXT:    [[MUL22:%.*]] = mul nsw i32 1, [[DIV21]]
+// CHECK9-NEXT:    [[CONV23:%.*]] = sext i32 [[MUL22]] to i64
+// CHECK9-NEXT:    [[DIV24:%.*]] = sdiv i64 [[TMP18]], [[CONV23]]
+// CHECK9-NEXT:    [[TMP20:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK9-NEXT:    [[SUB25:%.*]] = sub nsw i32 [[TMP20]], 0
+// CHECK9-NEXT:    [[DIV26:%.*]] = sdiv i32 [[SUB25]], 1
+// CHECK9-NEXT:    [[MUL27:%.*]] = mul nsw i32 1, [[DIV26]]
+// CHECK9-NEXT:    [[CONV28:%.*]] = sext i32 [[MUL27]] to i64
+// CHECK9-NEXT:    [[MUL29:%.*]] = mul nsw i64 [[DIV24]], [[CONV28]]
+// CHECK9-NEXT:    [[SUB30:%.*]] = sub nsw i64 [[TMP17]], [[MUL29]]
+// CHECK9-NEXT:    [[MUL31:%.*]] = mul nsw i64 [[SUB30]], 1
+// CHECK9-NEXT:    [[ADD32:%.*]] = add nsw i64 0, [[MUL31]]
+// CHECK9-NEXT:    [[CONV33:%.*]] = trunc i64 [[ADD32]] to i32
+// CHECK9-NEXT:    store i32 [[CONV33]], i32* [[J11]], align 4
+// CHECK9-NEXT:    [[TMP21:%.*]] = load i32, i32* [[I10]], align 4
+// CHECK9-NEXT:    [[TMP22:%.*]] = load i32, i32* [[J11]], align 4
+// CHECK9-NEXT:    [[ADD34:%.*]] = add nsw i32 [[TMP21]], [[TMP22]]
+// CHECK9-NEXT:    [[TMP23:%.*]] = load i32, i32* [[I10]], align 4
+// CHECK9-NEXT:    [[IDXPROM:%.*]] = sext i32 [[TMP23]] to i64
+// CHECK9-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* [[TMP0]], i64 0, i64 [[IDXPROM]]
+// CHECK9-NEXT:    [[TMP24:%.*]] = load i32, i32* [[J11]], align 4
+// CHECK9-NEXT:    [[IDXPROM35:%.*]] = sext i32 [[TMP24]] to i64
+// CHECK9-NEXT:    [[ARRAYIDX36:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[ARRAYIDX]], i64 0, i64 [[IDXPROM35]]
+// CHECK9-NEXT:    store i32 [[ADD34]], i32* [[ARRAYIDX36]], align 4
+// CHECK9-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK9:       omp.body.continue:
+// CHECK9-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK9:       omp.inner.for.inc:
+// CHECK9-NEXT:    [[TMP25:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8
+// CHECK9-NEXT:    [[TMP26:%.*]] = load i64, i64* [[DOTOMP_STRIDE]], align 8
+// CHECK9-NEXT:    [[ADD37:%.*]] = add nsw i64 [[TMP25]], [[TMP26]]
+// CHECK9-NEXT:    store i64 [[ADD37]], i64* [[DOTOMP_IV]], align 8
+// CHECK9-NEXT:    br label [[OMP_INNER_FOR_COND]]
+// CHECK9:       omp.inner.for.end:
+// CHECK9-NEXT:    br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK9:       omp.loop.exit:
+// CHECK9-NEXT:    [[TMP27:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK9-NEXT:    [[TMP28:%.*]] = load i32, i32* [[TMP27]], align 4
+// CHECK9-NEXT:    call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP28]])
+// CHECK9-NEXT:    br label [[OMP_PRECOND_END]]
+// CHECK9:       omp.precond.end:
+// CHECK9-NEXT:    ret void
+//
+//
+// CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l74
+// CHECK9-SAME: (i64 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32* [[V:%.*]]) #[[ATTR0]] {
+// CHECK9-NEXT:  entry:
+// CHECK9-NEXT:    [[N_ADDR:%.*]] = alloca i64, align 8
+// CHECK9-NEXT:    [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 8
+// CHECK9-NEXT:    [[V_ADDR:%.*]] = alloca i32*, align 8
+// CHECK9-NEXT:    [[N_CASTED:%.*]] = alloca i64, align 8
+// CHECK9-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK9-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK9-NEXT:    store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK9-NEXT:    store i64 [[N]], i64* [[N_ADDR]], align 8
+// CHECK9-NEXT:    store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 8
+// CHECK9-NEXT:    store i32* [[V]], i32** [[V_ADDR]], align 8
+// CHECK9-NEXT:    [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32*
+// CHECK9-NEXT:    [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 8
+// CHECK9-NEXT:    [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK9-NEXT:    call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0)
+// CHECK9-NEXT:    br label [[DOTEXECUTE:%.*]]
+// CHECK9:       .execute:
+// CHECK9-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]])
+// CHECK9-NEXT:    [[TMP2:%.*]] = load i32, i32* [[CONV]], align 8
+// CHECK9-NEXT:    [[CONV1:%.*]] = bitcast i64* [[N_CASTED]] to i32*
+// CHECK9-NEXT:    store i32 [[TMP2]], i32* [[CONV1]], align 4
+// CHECK9-NEXT:    [[TMP3:%.*]] = load i64, i64* [[N_CASTED]], align 8
+// CHECK9-NEXT:    [[TMP4:%.*]] = load i32*, i32** [[V_ADDR]], align 8
+// CHECK9-NEXT:    store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK9-NEXT:    call void @__omp_outlined__10(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i64 [[TMP3]], [1000 x i32]* [[TMP0]], i32* [[TMP4]]) #[[ATTR2]]
+// CHECK9-NEXT:    br label [[DOTOMP_DEINIT:%.*]]
+// CHECK9:       .omp.deinit:
+// CHECK9-NEXT:    call void @__kmpc_spmd_kernel_deinit_v2(i16 0)
+// CHECK9-NEXT:    br label [[DOTEXIT:%.*]]
+// CHECK9:       .exit:
+// CHECK9-NEXT:    ret void
+//
+//
+// CHECK9-LABEL: define {{[^@]+}}@__omp_outlined__10
+// CHECK9-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32* [[V:%.*]]) #[[ATTR0]] {
+// CHECK9-NEXT:  entry:
+// CHECK9-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK9-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK9-NEXT:    [[N_ADDR:%.*]] = alloca i64, align 8
+// CHECK9-NEXT:    [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 8
+// CHECK9-NEXT:    [[V_ADDR:%.*]] = alloca i32*, align 8
+// CHECK9-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK9-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK9-NEXT:    [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK9-NEXT:    [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
+// CHECK9-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK9-NEXT:    [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK9-NEXT:    [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK9-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK9-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK9-NEXT:    [[I3:%.*]] = alloca i32, align 4
+// CHECK9-NEXT:    [[N_CASTED:%.*]] = alloca i64, align 8
+// CHECK9-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [5 x i8*], align 8
+// CHECK9-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK9-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
+// CHECK9-NEXT:    store i64 [[N]], i64* [[N_ADDR]], align 8
+// CHECK9-NEXT:    store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 8
+// CHECK9-NEXT:    store i32* [[V]], i32** [[V_ADDR]], align 8
+// CHECK9-NEXT:    [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32*
+// CHECK9-NEXT:    [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 8
+// CHECK9-NEXT:    [[TMP1:%.*]] = load i32, i32* [[CONV]], align 8
+// CHECK9-NEXT:    store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK9-NEXT:    [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK9-NEXT:    [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0
+// CHECK9-NEXT:    [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK9-NEXT:    [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1
+// CHECK9-NEXT:    store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK9-NEXT:    store i32 0, i32* [[I]], align 4
+// CHECK9-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK9-NEXT:    [[CMP:%.*]] = icmp slt i32 0, [[TMP3]]
+// CHECK9-NEXT:    br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]]
+// CHECK9:       omp.precond.then:
+// CHECK9-NEXT:    store i32 0, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK9-NEXT:    [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK9-NEXT:    store i32 [[TMP4]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK9-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK9-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK9-NEXT:    [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK9-NEXT:    [[TMP5:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK9-NEXT:    [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4
+// CHECK9-NEXT:    call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP6]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]])
+// CHECK9-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK9-NEXT:    [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK9-NEXT:    [[CMP4:%.*]] = icmp sgt i32 [[TMP7]], [[TMP8]]
+// CHECK9-NEXT:    br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK9:       cond.true:
+// CHECK9-NEXT:    [[TMP9:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK9-NEXT:    br label [[COND_END:%.*]]
+// CHECK9:       cond.false:
+// CHECK9-NEXT:    [[TMP10:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK9-NEXT:    br label [[COND_END]]
+// CHECK9:       cond.end:
+// CHECK9-NEXT:    [[COND:%.*]] = phi i32 [ [[TMP9]], [[COND_TRUE]] ], [ [[TMP10]], [[COND_FALSE]] ]
+// CHECK9-NEXT:    store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK9-NEXT:    [[TMP11:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK9-NEXT:    store i32 [[TMP11]], i32* [[DOTOMP_IV]], align 4
+// CHECK9-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK9:       omp.inner.for.cond:
+// CHECK9-NEXT:    [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK9-NEXT:    [[TMP13:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK9-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP13]], 1
+// CHECK9-NEXT:    [[CMP5:%.*]] = icmp slt i32 [[TMP12]], [[ADD]]
+// CHECK9-NEXT:    br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK9:       omp.inner.for.body:
+// CHECK9-NEXT:    [[TMP14:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK9-NEXT:    [[TMP15:%.*]] = zext i32 [[TMP14]] to i64
+// CHECK9-NEXT:    [[TMP16:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK9-NEXT:    [[TMP17:%.*]] = zext i32 [[TMP16]] to i64
+// CHECK9-NEXT:    [[TMP18:%.*]] = load i32, i32* [[CONV]], align 8
+// CHECK9-NEXT:    [[CONV6:%.*]] = bitcast i64* [[N_CASTED]] to i32*
+// CHECK9-NEXT:    store i32 [[TMP18]], i32* [[CONV6]], align 4
+// CHECK9-NEXT:    [[TMP19:%.*]] = load i64, i64* [[N_CASTED]], align 8
+// CHECK9-NEXT:    [[TMP20:%.*]] = load i32*, i32** [[V_ADDR]], align 8
+// CHECK9-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0
+// CHECK9-NEXT:    [[TMP22:%.*]] = inttoptr i64 [[TMP15]] to i8*
+// CHECK9-NEXT:    store i8* [[TMP22]], i8** [[TMP21]], align 8
+// CHECK9-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1
+// CHECK9-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP17]] to i8*
+// CHECK9-NEXT:    store i8* [[TMP24]], i8** [[TMP23]], align 8
+// CHECK9-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 2
+// CHECK9-NEXT:    [[TMP26:%.*]] = inttoptr i64 [[TMP19]] to i8*
+// CHECK9-NEXT:    store i8* [[TMP26]], i8** [[TMP25]], align 8
+// CHECK9-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 3
+// CHECK9-NEXT:    [[TMP28:%.*]] = bitcast [1000 x i32]* [[TMP0]] to i8*
+// CHECK9-NEXT:    store i8* [[TMP28]], i8** [[TMP27]], align 8
+// CHECK9-NEXT:    [[TMP29:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 4
+// CHECK9-NEXT:    [[TMP30:%.*]] = bitcast i32* [[TMP20]] to i8*
+// CHECK9-NEXT:    store i8* [[TMP30]], i8** [[TMP29]], align 8
+// CHECK9-NEXT:    [[TMP31:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK9-NEXT:    [[TMP32:%.*]] = load i32, i32* [[TMP31]], align 4
+// CHECK9-NEXT:    [[TMP33:%.*]] = bitcast [5 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK9-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP32]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i64, i64, i64, [1000 x i32]*, i32*)* @__omp_outlined__11 to i8*), i8* null, i8** [[TMP33]], i64 5)
+// CHECK9-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK9:       omp.inner.for.inc:
+// CHECK9-NEXT:    [[TMP34:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK9-NEXT:    [[TMP35:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK9-NEXT:    [[ADD7:%.*]] = add nsw i32 [[TMP34]], [[TMP35]]
+// CHECK9-NEXT:    store i32 [[ADD7]], i32* [[DOTOMP_IV]], align 4
+// CHECK9-NEXT:    [[TMP36:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK9-NEXT:    [[TMP37:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK9-NEXT:    [[ADD8:%.*]] = add nsw i32 [[TMP36]], [[TMP37]]
+// CHECK9-NEXT:    store i32 [[ADD8]], i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK9-NEXT:    [[TMP38:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK9-NEXT:    [[TMP39:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK9-NEXT:    [[ADD9:%.*]] = add nsw i32 [[TMP38]], [[TMP39]]
+// CHECK9-NEXT:    store i32 [[ADD9]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK9-NEXT:    [[TMP40:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK9-NEXT:    [[TMP41:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK9-NEXT:    [[CMP10:%.*]] = icmp sgt i32 [[TMP40]], [[TMP41]]
+// CHECK9-NEXT:    br i1 [[CMP10]], label [[COND_TRUE11:%.*]], label [[COND_FALSE12:%.*]]
+// CHECK9:       cond.true11:
+// CHECK9-NEXT:    [[TMP42:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK9-NEXT:    br label [[COND_END13:%.*]]
+// CHECK9:       cond.false12:
+// CHECK9-NEXT:    [[TMP43:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK9-NEXT:    br label [[COND_END13]]
+// CHECK9:       cond.end13:
+// CHECK9-NEXT:    [[COND14:%.*]] = phi i32 [ [[TMP42]], [[COND_TRUE11]] ], [ [[TMP43]], [[COND_FALSE12]] ]
+// CHECK9-NEXT:    store i32 [[COND14]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK9-NEXT:    [[TMP44:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK9-NEXT:    store i32 [[TMP44]], i32* [[DOTOMP_IV]], align 4
+// CHECK9-NEXT:    br label [[OMP_INNER_FOR_COND]]
+// CHECK9:       omp.inner.for.end:
+// CHECK9-NEXT:    br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK9:       omp.loop.exit:
+// CHECK9-NEXT:    [[TMP45:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK9-NEXT:    [[TMP46:%.*]] = load i32, i32* [[TMP45]], align 4
+// CHECK9-NEXT:    call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP46]])
+// CHECK9-NEXT:    br label [[OMP_PRECOND_END]]
+// CHECK9:       omp.precond.end:
+// CHECK9-NEXT:    ret void
+//
+//
+// CHECK9-LABEL: define {{[^@]+}}@__omp_outlined__11
+// CHECK9-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], i64 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32* [[V:%.*]]) #[[ATTR0]] {
+// CHECK9-NEXT:  entry:
+// CHECK9-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK9-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK9-NEXT:    [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8
+// CHECK9-NEXT:    [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8
+// CHECK9-NEXT:    [[N_ADDR:%.*]] = alloca i64, align 8
+// CHECK9-NEXT:    [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 8
+// CHECK9-NEXT:    [[V_ADDR:%.*]] = alloca i32*, align 8
+// CHECK9-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK9-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK9-NEXT:    [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK9-NEXT:    [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
+// CHECK9-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK9-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK9-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK9-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK9-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK9-NEXT:    [[I5:%.*]] = alloca i32, align 4
+// CHECK9-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK9-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
+// CHECK9-NEXT:    store i64 [[DOTPREVIOUS_LB_]], i64* [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK9-NEXT:    store i64 [[DOTPREVIOUS_UB_]], i64* [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK9-NEXT:    store i64 [[N]], i64* [[N_ADDR]], align 8
+// CHECK9-NEXT:    store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 8
+// CHECK9-NEXT:    store i32* [[V]], i32** [[V_ADDR]], align 8
+// CHECK9-NEXT:    [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32*
+// CHECK9-NEXT:    [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 8
+// CHECK9-NEXT:    [[TMP1:%.*]] = load i32, i32* [[CONV]], align 8
+// CHECK9-NEXT:    store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK9-NEXT:    [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK9-NEXT:    [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0
+// CHECK9-NEXT:    [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK9-NEXT:    [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1
+// CHECK9-NEXT:    store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK9-NEXT:    store i32 0, i32* [[I]], align 4
+// CHECK9-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK9-NEXT:    [[CMP:%.*]] = icmp slt i32 0, [[TMP3]]
+// CHECK9-NEXT:    br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]]
+// CHECK9:       omp.precond.then:
+// CHECK9-NEXT:    store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK9-NEXT:    [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK9-NEXT:    store i32 [[TMP4]], i32* [[DOTOMP_UB]], align 4
+// CHECK9-NEXT:    [[TMP5:%.*]] = load i64, i64* [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK9-NEXT:    [[CONV3:%.*]] = trunc i64 [[TMP5]] to i32
+// CHECK9-NEXT:    [[TMP6:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK9-NEXT:    [[CONV4:%.*]] = trunc i64 [[TMP6]] to i32
+// CHECK9-NEXT:    store i32 [[CONV3]], i32* [[DOTOMP_LB]], align 4
+// CHECK9-NEXT:    store i32 [[CONV4]], i32* [[DOTOMP_UB]], align 4
+// CHECK9-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK9-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK9-NEXT:    [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK9-NEXT:    [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4
+// CHECK9-NEXT:    call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP8]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK9-NEXT:    [[TMP9:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK9-NEXT:    store i32 [[TMP9]], i32* [[DOTOMP_IV]], align 4
+// CHECK9-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK9:       omp.inner.for.cond:
+// CHECK9-NEXT:    [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK9-NEXT:    [[CONV6:%.*]] = sext i32 [[TMP10]] to i64
+// CHECK9-NEXT:    [[TMP11:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK9-NEXT:    [[CMP7:%.*]] = icmp ule i64 [[CONV6]], [[TMP11]]
+// CHECK9-NEXT:    br i1 [[CMP7]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK9:       omp.inner.for.body:
+// CHECK9-NEXT:    [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK9-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP12]], 1
+// CHECK9-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK9-NEXT:    store i32 [[ADD]], i32* [[I5]], align 4
+// CHECK9-NEXT:    [[TMP13:%.*]] = load i32*, i32** [[V_ADDR]], align 8
+// CHECK9-NEXT:    [[TMP14:%.*]] = load i32, i32* [[I5]], align 4
+// CHECK9-NEXT:    [[IDXPROM:%.*]] = sext i32 [[TMP14]] to i64
+// CHECK9-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TMP13]], i64 [[IDXPROM]]
+// CHECK9-NEXT:    [[TMP15:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+// CHECK9-NEXT:    [[TMP16:%.*]] = load i32, i32* [[I5]], align 4
+// CHECK9-NEXT:    [[IDXPROM8:%.*]] = sext i32 [[TMP16]] to i64
+// CHECK9-NEXT:    [[ARRAYIDX9:%.*]] = getelementptr inbounds [1000 x i32], [1000 x i32]* [[TMP0]], i64 0, i64 [[IDXPROM8]]
+// CHECK9-NEXT:    store i32 [[TMP15]], i32* [[ARRAYIDX9]], align 4
+// CHECK9-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK9:       omp.body.continue:
+// CHECK9-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK9:       omp.inner.for.inc:
+// CHECK9-NEXT:    [[TMP17:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK9-NEXT:    [[TMP18:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK9-NEXT:    [[ADD10:%.*]] = add nsw i32 [[TMP17]], [[TMP18]]
+// CHECK9-NEXT:    store i32 [[ADD10]], i32* [[DOTOMP_IV]], align 4
+// CHECK9-NEXT:    br label [[OMP_INNER_FOR_COND]]
+// CHECK9:       omp.inner.for.end:
+// CHECK9-NEXT:    br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK9:       omp.loop.exit:
+// CHECK9-NEXT:    [[TMP19:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK9-NEXT:    [[TMP20:%.*]] = load i32, i32* [[TMP19]], align 4
+// CHECK9-NEXT:    call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP20]])
+// CHECK9-NEXT:    br label [[OMP_PRECOND_END]]
+// CHECK9:       omp.precond.end:
+// CHECK9-NEXT:    ret void
+//
+//
+// CHECK10-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l43
+// CHECK10-SAME: (i64 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i64 [[L:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK10-NEXT:  entry:
+// CHECK10-NEXT:    [[N_ADDR:%.*]] = alloca i64, align 8
+// CHECK10-NEXT:    [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 8
+// CHECK10-NEXT:    [[L_ADDR:%.*]] = alloca i64, align 8
+// CHECK10-NEXT:    [[N_CASTED:%.*]] = alloca i64, align 8
+// CHECK10-NEXT:    [[L_CASTED:%.*]] = alloca i64, align 8
+// CHECK10-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK10-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK10-NEXT:    store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK10-NEXT:    store i64 [[N]], i64* [[N_ADDR]], align 8
+// CHECK10-NEXT:    store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 8
+// CHECK10-NEXT:    store i64 [[L]], i64* [[L_ADDR]], align 8
+// CHECK10-NEXT:    [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32*
+// CHECK10-NEXT:    [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 8
+// CHECK10-NEXT:    [[CONV1:%.*]] = bitcast i64* [[L_ADDR]] to i32*
+// CHECK10-NEXT:    [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK10-NEXT:    call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0)
+// CHECK10-NEXT:    br label [[DOTEXECUTE:%.*]]
+// CHECK10:       .execute:
+// CHECK10-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3:[0-9]+]])
+// CHECK10-NEXT:    [[TMP2:%.*]] = load i32, i32* [[CONV]], align 8
+// CHECK10-NEXT:    [[CONV2:%.*]] = bitcast i64* [[N_CASTED]] to i32*
+// CHECK10-NEXT:    store i32 [[TMP2]], i32* [[CONV2]], align 4
+// CHECK10-NEXT:    [[TMP3:%.*]] = load i64, i64* [[N_CASTED]], align 8
+// CHECK10-NEXT:    [[TMP4:%.*]] = load i32, i32* [[CONV1]], align 8
+// CHECK10-NEXT:    [[CONV3:%.*]] = bitcast i64* [[L_CASTED]] to i32*
+// CHECK10-NEXT:    store i32 [[TMP4]], i32* [[CONV3]], align 4
+// CHECK10-NEXT:    [[TMP5:%.*]] = load i64, i64* [[L_CASTED]], align 8
+// CHECK10-NEXT:    store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK10-NEXT:    call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i64 [[TMP3]], [1000 x i32]* [[TMP0]], i64 [[TMP5]]) #[[ATTR2:[0-9]+]]
+// CHECK10-NEXT:    br label [[DOTOMP_DEINIT:%.*]]
+// CHECK10:       .omp.deinit:
+// CHECK10-NEXT:    call void @__kmpc_spmd_kernel_deinit_v2(i16 0)
+// CHECK10-NEXT:    br label [[DOTEXIT:%.*]]
+// CHECK10:       .exit:
+// CHECK10-NEXT:    ret void
+//
+//
+// CHECK10-LABEL: define {{[^@]+}}@__omp_outlined__
+// CHECK10-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i64 [[L:%.*]]) #[[ATTR0]] {
+// CHECK10-NEXT:  entry:
+// CHECK10-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK10-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK10-NEXT:    [[N_ADDR:%.*]] = alloca i64, align 8
+// CHECK10-NEXT:    [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 8
+// CHECK10-NEXT:    [[L_ADDR:%.*]] = alloca i64, align 8
+// CHECK10-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK10-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK10-NEXT:    [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK10-NEXT:    [[DOTCAPTURE_EXPR_3:%.*]] = alloca i32, align 4
+// CHECK10-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK10-NEXT:    [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK10-NEXT:    [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK10-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK10-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK10-NEXT:    [[I5:%.*]] = alloca i32, align 4
+// CHECK10-NEXT:    [[N_CASTED:%.*]] = alloca i64, align 8
+// CHECK10-NEXT:    [[L_CASTED:%.*]] = alloca i64, align 8
+// CHECK10-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [5 x i8*], align 8
+// CHECK10-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK10-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
+// CHECK10-NEXT:    store i64 [[N]], i64* [[N_ADDR]], align 8
+// CHECK10-NEXT:    store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 8
+// CHECK10-NEXT:    store i64 [[L]], i64* [[L_ADDR]], align 8
+// CHECK10-NEXT:    [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32*
+// CHECK10-NEXT:    [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 8
+// CHECK10-NEXT:    [[CONV1:%.*]] = bitcast i64* [[L_ADDR]] to i32*
+// CHECK10-NEXT:    [[TMP1:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared", align 2
+// CHECK10-NEXT:    [[TMP2:%.*]] = load i64, i64* @"_openmp_static_kernel$size", align 8
+// CHECK10-NEXT:    call void @__kmpc_get_team_static_memory(i16 1, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds (%"union._shared_openmp_static_memory_type_$_", %"union._shared_openmp_static_memory_type_$_" addrspace(3)* @"_openmp_shared_static_glob_rd_$_", i32 0, i32 0, i32 0) to i8*), i64 [[TMP2]], i16 [[TMP1]], i8** addrspacecast (i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr" to
i8**)) +// CHECK10-NEXT: [[TMP3:%.*]] = load i8*, i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr", align 8 +// CHECK10-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, i8* [[TMP3]], i64 0 +// CHECK10-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to %struct._globalized_locals_ty* +// CHECK10-NEXT: [[L2:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP5]], i32 0, i32 0 +// CHECK10-NEXT: [[TMP6:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK10-NEXT: store i32 [[TMP6]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK10-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK10-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP7]], 0 +// CHECK10-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK10-NEXT: [[SUB4:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK10-NEXT: store i32 [[SUB4]], i32* [[DOTCAPTURE_EXPR_3]], align 4 +// CHECK10-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK10-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK10-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP8]] +// CHECK10-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK10: omp.precond.then: +// CHECK10-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK10-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4 +// CHECK10-NEXT: store i32 [[TMP9]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK10-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK10-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK10-NEXT: [[TMP10:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK10-NEXT: [[TMP11:%.*]] = load i32, i32* [[TMP10]], align 4 +// CHECK10-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1:[0-9]+]], i32 [[TMP11]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 128) +// CHECK10-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK10-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4 +// CHECK10-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[TMP12]], [[TMP13]] +// CHECK10-NEXT: br i1 [[CMP6]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK10: cond.true: +// CHECK10-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4 +// CHECK10-NEXT: br label [[COND_END:%.*]] +// CHECK10: cond.false: +// CHECK10-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK10-NEXT: br label [[COND_END]] +// CHECK10: cond.end: +// CHECK10-NEXT: [[COND:%.*]] = phi i32 [ [[TMP14]], [[COND_TRUE]] ], [ [[TMP15]], [[COND_FALSE]] ] +// CHECK10-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK10-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK10-NEXT: store i32 [[TMP16]], i32* [[DOTOMP_IV]], align 4 +// CHECK10-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK10: omp.inner.for.cond: +// CHECK10-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK10-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4 +// CHECK10-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP18]], 1 +// CHECK10-NEXT: [[CMP7:%.*]] = icmp slt i32 [[TMP17]], [[ADD]] +// CHECK10-NEXT: br i1 [[CMP7]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK10: omp.inner.for.body: +// CHECK10-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK10-NEXT: [[TMP20:%.*]] = zext i32 [[TMP19]] to i64 +// CHECK10-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], 
align 4 +// CHECK10-NEXT: [[TMP22:%.*]] = zext i32 [[TMP21]] to i64 +// CHECK10-NEXT: [[TMP23:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK10-NEXT: [[CONV8:%.*]] = bitcast i64* [[N_CASTED]] to i32* +// CHECK10-NEXT: store i32 [[TMP23]], i32* [[CONV8]], align 4 +// CHECK10-NEXT: [[TMP24:%.*]] = load i64, i64* [[N_CASTED]], align 8 +// CHECK10-NEXT: [[TMP25:%.*]] = load i32, i32* [[CONV1]], align 8 +// CHECK10-NEXT: [[CONV9:%.*]] = bitcast i64* [[L_CASTED]] to i32* +// CHECK10-NEXT: store i32 [[TMP25]], i32* [[CONV9]], align 4 +// CHECK10-NEXT: [[TMP26:%.*]] = load i64, i64* [[L_CASTED]], align 8 +// CHECK10-NEXT: [[TMP27:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0 +// CHECK10-NEXT: [[TMP28:%.*]] = inttoptr i64 [[TMP20]] to i8* +// CHECK10-NEXT: store i8* [[TMP28]], i8** [[TMP27]], align 8 +// CHECK10-NEXT: [[TMP29:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1 +// CHECK10-NEXT: [[TMP30:%.*]] = inttoptr i64 [[TMP22]] to i8* +// CHECK10-NEXT: store i8* [[TMP30]], i8** [[TMP29]], align 8 +// CHECK10-NEXT: [[TMP31:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 2 +// CHECK10-NEXT: [[TMP32:%.*]] = inttoptr i64 [[TMP24]] to i8* +// CHECK10-NEXT: store i8* [[TMP32]], i8** [[TMP31]], align 8 +// CHECK10-NEXT: [[TMP33:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 3 +// CHECK10-NEXT: [[TMP34:%.*]] = bitcast [1000 x i32]* [[TMP0]] to i8* +// CHECK10-NEXT: store i8* [[TMP34]], i8** [[TMP33]], align 8 +// CHECK10-NEXT: [[TMP35:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 4 +// CHECK10-NEXT: [[TMP36:%.*]] = inttoptr i64 [[TMP26]] to i8* +// CHECK10-NEXT: store i8* [[TMP36]], i8** [[TMP35]], align 8 +// CHECK10-NEXT: [[TMP37:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK10-NEXT: [[TMP38:%.*]] = load i32, i32* [[TMP37]], align 4 +// CHECK10-NEXT: [[TMP39:%.*]] = bitcast [5 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK10-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP38]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i64, i64, i64, [1000 x i32]*, i64)* @__omp_outlined__1 to i8*), i8* null, i8** [[TMP39]], i64 5) +// CHECK10-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK10: omp.inner.for.inc: +// CHECK10-NEXT: [[TMP40:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK10-NEXT: [[TMP41:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK10-NEXT: [[ADD10:%.*]] = add nsw i32 [[TMP40]], [[TMP41]] +// CHECK10-NEXT: store i32 [[ADD10]], i32* [[DOTOMP_IV]], align 4 +// CHECK10-NEXT: [[TMP42:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK10-NEXT: [[TMP43:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK10-NEXT: [[ADD11:%.*]] = add nsw i32 [[TMP42]], [[TMP43]] +// CHECK10-NEXT: store i32 [[ADD11]], i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK10-NEXT: [[TMP44:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK10-NEXT: [[TMP45:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK10-NEXT: [[ADD12:%.*]] = add nsw i32 [[TMP44]], [[TMP45]] +// CHECK10-NEXT: store i32 [[ADD12]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK10-NEXT: [[TMP46:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK10-NEXT: [[TMP47:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4 +// CHECK10-NEXT: [[CMP13:%.*]] = icmp sgt i32 [[TMP46]], [[TMP47]] +// CHECK10-NEXT: br i1 [[CMP13]], label [[COND_TRUE14:%.*]], 
label [[COND_FALSE15:%.*]] +// CHECK10: cond.true14: +// CHECK10-NEXT: [[TMP48:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4 +// CHECK10-NEXT: br label [[COND_END16:%.*]] +// CHECK10: cond.false15: +// CHECK10-NEXT: [[TMP49:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK10-NEXT: br label [[COND_END16]] +// CHECK10: cond.end16: +// CHECK10-NEXT: [[COND17:%.*]] = phi i32 [ [[TMP48]], [[COND_TRUE14]] ], [ [[TMP49]], [[COND_FALSE15]] ] +// CHECK10-NEXT: store i32 [[COND17]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK10-NEXT: [[TMP50:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK10-NEXT: store i32 [[TMP50]], i32* [[DOTOMP_IV]], align 4 +// CHECK10-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK10: omp.inner.for.end: +// CHECK10-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK10: omp.loop.exit: +// CHECK10-NEXT: [[TMP51:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK10-NEXT: [[TMP52:%.*]] = load i32, i32* [[TMP51]], align 4 +// CHECK10-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP52]]) +// CHECK10-NEXT: [[TMP53:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK10-NEXT: [[TMP54:%.*]] = icmp ne i32 [[TMP53]], 0 +// CHECK10-NEXT: br i1 [[TMP54]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]] +// CHECK10: .omp.lastprivate.then: +// CHECK10-NEXT: [[TMP55:%.*]] = load i32, i32* [[CONV1]], align 8 +// CHECK10-NEXT: store i32 [[TMP55]], i32* [[CONV1]], align 8 +// CHECK10-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]] +// CHECK10: .omp.lastprivate.done: +// CHECK10-NEXT: br label [[OMP_PRECOND_END]] +// CHECK10: omp.precond.end: +// CHECK10-NEXT: [[TMP56:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared", align 2 +// CHECK10-NEXT: call void @__kmpc_restore_team_static_memory(i16 1, i16 [[TMP56]]) +// CHECK10-NEXT: ret void +// +// +// CHECK10-LABEL: define {{[^@]+}}@__omp_outlined__1 +// CHECK10-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], i64 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i64 [[L:%.*]]) #[[ATTR0]] { +// CHECK10-NEXT: entry: +// CHECK10-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK10-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK10-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8 +// CHECK10-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8 +// CHECK10-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 +// CHECK10-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 8 +// CHECK10-NEXT: [[L_ADDR:%.*]] = alloca i64, align 8 +// CHECK10-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK10-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK10-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK10-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4 +// CHECK10-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK10-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK10-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK10-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK10-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK10-NEXT: [[I6:%.*]] = alloca i32, align 4 +// CHECK10-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK10-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +// CHECK10-NEXT: store i64 [[DOTPREVIOUS_LB_]], i64* [[DOTPREVIOUS_LB__ADDR]], align 8 +// CHECK10-NEXT: store i64 
[[DOTPREVIOUS_UB_]], i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK10-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 +// CHECK10-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 8 +// CHECK10-NEXT: store i64 [[L]], i64* [[L_ADDR]], align 8 +// CHECK10-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* +// CHECK10-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 8 +// CHECK10-NEXT: [[CONV1:%.*]] = bitcast i64* [[L_ADDR]] to i32* +// CHECK10-NEXT: [[TMP1:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK10-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK10-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK10-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 +// CHECK10-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK10-NEXT: [[SUB3:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK10-NEXT: store i32 [[SUB3]], i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK10-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK10-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK10-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] +// CHECK10-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK10: omp.precond.then: +// CHECK10-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 +// CHECK10-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK10-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_UB]], align 4 +// CHECK10-NEXT: [[TMP5:%.*]] = load i64, i64* [[DOTPREVIOUS_LB__ADDR]], align 8 +// CHECK10-NEXT: [[CONV4:%.*]] = trunc i64 [[TMP5]] to i32 +// CHECK10-NEXT: [[TMP6:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK10-NEXT: [[CONV5:%.*]] = trunc i64 [[TMP6]] to i32 +// CHECK10-NEXT: store i32 [[CONV4]], i32* [[DOTOMP_LB]], align 4 +// CHECK10-NEXT: store i32 [[CONV5]], i32* [[DOTOMP_UB]], align 4 +// CHECK10-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK10-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK10-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK10-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4 +// CHECK10-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP8]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 32) +// CHECK10-NEXT: br label [[OMP_DISPATCH_COND:%.*]] +// CHECK10: omp.dispatch.cond: +// CHECK10-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK10-NEXT: [[CONV7:%.*]] = sext i32 [[TMP9]] to i64 +// CHECK10-NEXT: [[TMP10:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK10-NEXT: [[CMP8:%.*]] = icmp ugt i64 [[CONV7]], [[TMP10]] +// CHECK10-NEXT: br i1 [[CMP8]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK10: cond.true: +// CHECK10-NEXT: [[TMP11:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK10-NEXT: br label [[COND_END:%.*]] +// CHECK10: cond.false: +// CHECK10-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK10-NEXT: [[CONV9:%.*]] = sext i32 [[TMP12]] to i64 +// CHECK10-NEXT: br label [[COND_END]] +// CHECK10: cond.end: +// CHECK10-NEXT: [[COND:%.*]] = phi i64 [ [[TMP11]], [[COND_TRUE]] ], [ [[CONV9]], [[COND_FALSE]] ] +// CHECK10-NEXT: [[CONV10:%.*]] = trunc i64 [[COND]] to i32 +// CHECK10-NEXT: store i32 [[CONV10]], i32* [[DOTOMP_UB]], align 4 +// CHECK10-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK10-NEXT: store i32 [[TMP13]], i32* [[DOTOMP_IV]], align 4 +// 
CHECK10-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK10-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK10-NEXT: [[CMP11:%.*]] = icmp sle i32 [[TMP14]], [[TMP15]] +// CHECK10-NEXT: br i1 [[CMP11]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]] +// CHECK10: omp.dispatch.body: +// CHECK10-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK10: omp.inner.for.cond: +// CHECK10-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK10-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK10-NEXT: [[CMP12:%.*]] = icmp sle i32 [[TMP16]], [[TMP17]] +// CHECK10-NEXT: br i1 [[CMP12]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK10: omp.inner.for.body: +// CHECK10-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK10-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP18]], 1 +// CHECK10-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK10-NEXT: store i32 [[ADD]], i32* [[I6]], align 4 +// CHECK10-NEXT: [[TMP19:%.*]] = load i32, i32* [[I6]], align 4 +// CHECK10-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP19]] to i64 +// CHECK10-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1000 x i32], [1000 x i32]* [[TMP0]], i64 0, i64 [[IDXPROM]] +// CHECK10-NEXT: store i32 1, i32* [[ARRAYIDX]], align 4 +// CHECK10-NEXT: [[TMP20:%.*]] = load i32, i32* [[I6]], align 4 +// CHECK10-NEXT: store i32 [[TMP20]], i32* [[CONV1]], align 8 +// CHECK10-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK10: omp.body.continue: +// CHECK10-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK10: omp.inner.for.inc: +// CHECK10-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK10-NEXT: [[ADD13:%.*]] = add nsw i32 [[TMP21]], 1 +// CHECK10-NEXT: store i32 [[ADD13]], i32* [[DOTOMP_IV]], align 4 +// CHECK10-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK10: omp.inner.for.end: +// CHECK10-NEXT: br label [[OMP_DISPATCH_INC:%.*]] +// CHECK10: omp.dispatch.inc: +// CHECK10-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK10-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK10-NEXT: [[ADD14:%.*]] = add nsw i32 [[TMP22]], [[TMP23]] +// CHECK10-NEXT: store i32 [[ADD14]], i32* [[DOTOMP_LB]], align 4 +// CHECK10-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK10-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK10-NEXT: [[ADD15:%.*]] = add nsw i32 [[TMP24]], [[TMP25]] +// CHECK10-NEXT: store i32 [[ADD15]], i32* [[DOTOMP_UB]], align 4 +// CHECK10-NEXT: br label [[OMP_DISPATCH_COND]] +// CHECK10: omp.dispatch.end: +// CHECK10-NEXT: [[TMP26:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK10-NEXT: [[TMP27:%.*]] = load i32, i32* [[TMP26]], align 4 +// CHECK10-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP27]]) +// CHECK10-NEXT: [[TMP28:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK10-NEXT: [[TMP29:%.*]] = icmp ne i32 [[TMP28]], 0 +// CHECK10-NEXT: br i1 [[TMP29]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]] +// CHECK10: .omp.lastprivate.then: +// CHECK10-NEXT: [[TMP30:%.*]] = load i32, i32* [[CONV1]], align 8 +// CHECK10-NEXT: store i32 [[TMP30]], i32* [[CONV1]], align 8 +// CHECK10-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]] +// CHECK10: .omp.lastprivate.done: +// CHECK10-NEXT: br label [[OMP_PRECOND_END]] +// CHECK10: omp.precond.end: +// CHECK10-NEXT: ret void +// +// +// CHECK10-LABEL: define 
{{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l49 +// CHECK10-SAME: (i64 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { +// CHECK10-NEXT: entry: +// CHECK10-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 +// CHECK10-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 8 +// CHECK10-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8 +// CHECK10-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK10-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK10-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK10-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 +// CHECK10-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 8 +// CHECK10-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* +// CHECK10-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 8 +// CHECK10-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK10-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) +// CHECK10-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK10: .execute: +// CHECK10-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) +// CHECK10-NEXT: [[TMP2:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK10-NEXT: [[CONV1:%.*]] = bitcast i64* [[N_CASTED]] to i32* +// CHECK10-NEXT: store i32 [[TMP2]], i32* [[CONV1]], align 4 +// CHECK10-NEXT: [[TMP3:%.*]] = load i64, i64* [[N_CASTED]], align 8 +// CHECK10-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK10-NEXT: call void @__omp_outlined__2(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i64 [[TMP3]], [1000 x i16]* [[TMP0]]) #[[ATTR2]] +// CHECK10-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK10: .omp.deinit: +// CHECK10-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) +// CHECK10-NEXT: br label [[DOTEXIT:%.*]] +// CHECK10: .exit: +// CHECK10-NEXT: ret void +// +// +// CHECK10-LABEL: define {{[^@]+}}@__omp_outlined__2 +// CHECK10-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { +// CHECK10-NEXT: entry: +// CHECK10-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK10-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK10-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 +// CHECK10-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 8 +// CHECK10-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK10-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK10-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK10-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 +// CHECK10-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK10-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 +// CHECK10-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 +// CHECK10-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK10-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK10-NEXT: [[I3:%.*]] = alloca i32, align 4 +// CHECK10-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8 +// CHECK10-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x i8*], align 8 +// CHECK10-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK10-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +// CHECK10-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 +// CHECK10-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 8 +// CHECK10-NEXT: [[CONV:%.*]] = bitcast i64* 
[[N_ADDR]] to i32* +// CHECK10-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 8 +// CHECK10-NEXT: [[TMP1:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK10-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK10-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK10-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 +// CHECK10-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK10-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK10-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK10-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK10-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK10-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] +// CHECK10-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK10: omp.precond.then: +// CHECK10-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK10-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK10-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK10-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK10-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK10-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK10-NEXT: [[TMP5:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK10-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4 +// CHECK10-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP6]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) +// CHECK10-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK10-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK10-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP7]], [[TMP8]] +// CHECK10-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK10: cond.true: +// CHECK10-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK10-NEXT: br label [[COND_END:%.*]] +// CHECK10: cond.false: +// CHECK10-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK10-NEXT: br label [[COND_END]] +// CHECK10: cond.end: +// CHECK10-NEXT: [[COND:%.*]] = phi i32 [ [[TMP9]], [[COND_TRUE]] ], [ [[TMP10]], [[COND_FALSE]] ] +// CHECK10-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK10-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK10-NEXT: store i32 [[TMP11]], i32* [[DOTOMP_IV]], align 4 +// CHECK10-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK10: omp.inner.for.cond: +// CHECK10-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK10-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK10-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP13]], 1 +// CHECK10-NEXT: [[CMP5:%.*]] = icmp slt i32 [[TMP12]], [[ADD]] +// CHECK10-NEXT: br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK10: omp.inner.for.body: +// CHECK10-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK10-NEXT: [[TMP15:%.*]] = zext i32 [[TMP14]] to i64 +// CHECK10-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK10-NEXT: [[TMP17:%.*]] = zext i32 [[TMP16]] to i64 +// CHECK10-NEXT: [[TMP18:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK10-NEXT: [[CONV6:%.*]] = bitcast i64* [[N_CASTED]] to i32* +// CHECK10-NEXT: store i32 
[[TMP18]], i32* [[CONV6]], align 4 +// CHECK10-NEXT: [[TMP19:%.*]] = load i64, i64* [[N_CASTED]], align 8 +// CHECK10-NEXT: [[TMP20:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0 +// CHECK10-NEXT: [[TMP21:%.*]] = inttoptr i64 [[TMP15]] to i8* +// CHECK10-NEXT: store i8* [[TMP21]], i8** [[TMP20]], align 8 +// CHECK10-NEXT: [[TMP22:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1 +// CHECK10-NEXT: [[TMP23:%.*]] = inttoptr i64 [[TMP17]] to i8* +// CHECK10-NEXT: store i8* [[TMP23]], i8** [[TMP22]], align 8 +// CHECK10-NEXT: [[TMP24:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 2 +// CHECK10-NEXT: [[TMP25:%.*]] = inttoptr i64 [[TMP19]] to i8* +// CHECK10-NEXT: store i8* [[TMP25]], i8** [[TMP24]], align 8 +// CHECK10-NEXT: [[TMP26:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 3 +// CHECK10-NEXT: [[TMP27:%.*]] = bitcast [1000 x i16]* [[TMP0]] to i8* +// CHECK10-NEXT: store i8* [[TMP27]], i8** [[TMP26]], align 8 +// CHECK10-NEXT: [[TMP28:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK10-NEXT: [[TMP29:%.*]] = load i32, i32* [[TMP28]], align 4 +// CHECK10-NEXT: [[TMP30:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK10-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP29]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i64, i64, i64, [1000 x i16]*)* @__omp_outlined__3 to i8*), i8* null, i8** [[TMP30]], i64 4) +// CHECK10-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK10: omp.inner.for.inc: +// CHECK10-NEXT: [[TMP31:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK10-NEXT: [[TMP32:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK10-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP31]], [[TMP32]] +// CHECK10-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_IV]], align 4 +// CHECK10-NEXT: [[TMP33:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK10-NEXT: [[TMP34:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK10-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP33]], [[TMP34]] +// CHECK10-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK10-NEXT: [[TMP35:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK10-NEXT: [[TMP36:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK10-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP35]], [[TMP36]] +// CHECK10-NEXT: store i32 [[ADD9]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK10-NEXT: [[TMP37:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK10-NEXT: [[TMP38:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK10-NEXT: [[CMP10:%.*]] = icmp sgt i32 [[TMP37]], [[TMP38]] +// CHECK10-NEXT: br i1 [[CMP10]], label [[COND_TRUE11:%.*]], label [[COND_FALSE12:%.*]] +// CHECK10: cond.true11: +// CHECK10-NEXT: [[TMP39:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK10-NEXT: br label [[COND_END13:%.*]] +// CHECK10: cond.false12: +// CHECK10-NEXT: [[TMP40:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK10-NEXT: br label [[COND_END13]] +// CHECK10: cond.end13: +// CHECK10-NEXT: [[COND14:%.*]] = phi i32 [ [[TMP39]], [[COND_TRUE11]] ], [ [[TMP40]], [[COND_FALSE12]] ] +// CHECK10-NEXT: store i32 [[COND14]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK10-NEXT: [[TMP41:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK10-NEXT: store i32 [[TMP41]], i32* [[DOTOMP_IV]], align 4 +// CHECK10-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK10: omp.inner.for.end: 
+// CHECK10-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK10: omp.loop.exit: +// CHECK10-NEXT: [[TMP42:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK10-NEXT: [[TMP43:%.*]] = load i32, i32* [[TMP42]], align 4 +// CHECK10-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP43]]) +// CHECK10-NEXT: br label [[OMP_PRECOND_END]] +// CHECK10: omp.precond.end: +// CHECK10-NEXT: ret void +// +// +// CHECK10-LABEL: define {{[^@]+}}@__omp_outlined__3 +// CHECK10-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], i64 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { +// CHECK10-NEXT: entry: +// CHECK10-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK10-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK10-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8 +// CHECK10-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8 +// CHECK10-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 +// CHECK10-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 8 +// CHECK10-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK10-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK10-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK10-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 +// CHECK10-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK10-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK10-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK10-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK10-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK10-NEXT: [[I5:%.*]] = alloca i32, align 4 +// CHECK10-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK10-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +// CHECK10-NEXT: store i64 [[DOTPREVIOUS_LB_]], i64* [[DOTPREVIOUS_LB__ADDR]], align 8 +// CHECK10-NEXT: store i64 [[DOTPREVIOUS_UB_]], i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK10-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 +// CHECK10-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 8 +// CHECK10-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* +// CHECK10-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 8 +// CHECK10-NEXT: [[TMP1:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK10-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK10-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK10-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 +// CHECK10-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK10-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK10-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK10-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK10-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK10-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] +// CHECK10-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK10: omp.precond.then: +// CHECK10-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 +// CHECK10-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK10-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_UB]], align 4 +// CHECK10-NEXT: [[TMP5:%.*]] = load i64, i64* [[DOTPREVIOUS_LB__ADDR]], align 8 +// CHECK10-NEXT: [[CONV3:%.*]] = trunc i64 [[TMP5]] to i32 +// CHECK10-NEXT: [[TMP6:%.*]] = load i64, i64* 
[[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK10-NEXT: [[CONV4:%.*]] = trunc i64 [[TMP6]] to i32 +// CHECK10-NEXT: store i32 [[CONV3]], i32* [[DOTOMP_LB]], align 4 +// CHECK10-NEXT: store i32 [[CONV4]], i32* [[DOTOMP_UB]], align 4 +// CHECK10-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK10-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK10-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK10-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4 +// CHECK10-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP8]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) +// CHECK10-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK10-NEXT: store i32 [[TMP9]], i32* [[DOTOMP_IV]], align 4 +// CHECK10-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK10: omp.inner.for.cond: +// CHECK10-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK10-NEXT: [[CONV6:%.*]] = sext i32 [[TMP10]] to i64 +// CHECK10-NEXT: [[TMP11:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK10-NEXT: [[CMP7:%.*]] = icmp ule i64 [[CONV6]], [[TMP11]] +// CHECK10-NEXT: br i1 [[CMP7]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK10: omp.inner.for.body: +// CHECK10-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK10-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP12]], 1 +// CHECK10-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK10-NEXT: store i32 [[ADD]], i32* [[I5]], align 4 +// CHECK10-NEXT: [[TMP13:%.*]] = load i32, i32* [[I5]], align 4 +// CHECK10-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP13]] to i64 +// CHECK10-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1000 x i16], [1000 x i16]* [[TMP0]], i64 0, i64 [[IDXPROM]] +// CHECK10-NEXT: [[TMP14:%.*]] = load i16, i16* [[ARRAYIDX]], align 2 +// CHECK10-NEXT: [[CONV8:%.*]] = sext i16 [[TMP14]] to i32 +// CHECK10-NEXT: [[ADD9:%.*]] = add nsw i32 [[CONV8]], 1 +// CHECK10-NEXT: [[CONV10:%.*]] = trunc i32 [[ADD9]] to i16 +// CHECK10-NEXT: store i16 [[CONV10]], i16* [[ARRAYIDX]], align 2 +// CHECK10-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK10: omp.body.continue: +// CHECK10-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK10: omp.inner.for.inc: +// CHECK10-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK10-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK10-NEXT: [[ADD11:%.*]] = add nsw i32 [[TMP15]], [[TMP16]] +// CHECK10-NEXT: store i32 [[ADD11]], i32* [[DOTOMP_IV]], align 4 +// CHECK10-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK10: omp.inner.for.end: +// CHECK10-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK10: omp.loop.exit: +// CHECK10-NEXT: [[TMP17:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK10-NEXT: [[TMP18:%.*]] = load i32, i32* [[TMP17]], align 4 +// CHECK10-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP18]]) +// CHECK10-NEXT: br label [[OMP_PRECOND_END]] +// CHECK10: omp.precond.end: +// CHECK10-NEXT: ret void +// +// +// CHECK10-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l54 +// CHECK10-SAME: ([10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { +// CHECK10-NEXT: entry: +// CHECK10-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 8 +// CHECK10-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK10-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK10-NEXT: 
store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK10-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 8 +// CHECK10-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 8 +// CHECK10-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK10-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) +// CHECK10-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK10: .execute: +// CHECK10-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) +// CHECK10-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK10-NEXT: call void @__omp_outlined__4(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x i32]* [[TMP0]]) #[[ATTR2]] +// CHECK10-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK10: .omp.deinit: +// CHECK10-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) +// CHECK10-NEXT: br label [[DOTEXIT:%.*]] +// CHECK10: .exit: +// CHECK10-NEXT: ret void +// +// +// CHECK10-LABEL: define {{[^@]+}}@__omp_outlined__4 +// CHECK10-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { +// CHECK10-NEXT: entry: +// CHECK10-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK10-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK10-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 8 +// CHECK10-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK10-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK10-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 +// CHECK10-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 +// CHECK10-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK10-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK10-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK10-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [3 x i8*], align 8 +// CHECK10-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK10-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +// CHECK10-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 8 +// CHECK10-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 8 +// CHECK10-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK10-NEXT: store i32 9, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK10-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK10-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK10-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK10-NEXT: [[TMP1:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK10-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4 +// CHECK10-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) +// CHECK10-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK10-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP3]], 9 +// CHECK10-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK10: cond.true: +// CHECK10-NEXT: br label [[COND_END:%.*]] +// CHECK10: cond.false: +// CHECK10-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK10-NEXT: br label [[COND_END]] +// CHECK10: cond.end: +// CHECK10-NEXT: [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP4]], [[COND_FALSE]] ] +// CHECK10-NEXT: store i32 
[[COND]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK10-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK10-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 +// CHECK10-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK10: omp.inner.for.cond: +// CHECK10-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK10-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP6]], 10 +// CHECK10-NEXT: br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK10: omp.inner.for.body: +// CHECK10-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK10-NEXT: [[TMP8:%.*]] = zext i32 [[TMP7]] to i64 +// CHECK10-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK10-NEXT: [[TMP10:%.*]] = zext i32 [[TMP9]] to i64 +// CHECK10-NEXT: [[TMP11:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0 +// CHECK10-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP8]] to i8* +// CHECK10-NEXT: store i8* [[TMP12]], i8** [[TMP11]], align 8 +// CHECK10-NEXT: [[TMP13:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1 +// CHECK10-NEXT: [[TMP14:%.*]] = inttoptr i64 [[TMP10]] to i8* +// CHECK10-NEXT: store i8* [[TMP14]], i8** [[TMP13]], align 8 +// CHECK10-NEXT: [[TMP15:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 2 +// CHECK10-NEXT: [[TMP16:%.*]] = bitcast [10 x i32]* [[TMP0]] to i8* +// CHECK10-NEXT: store i8* [[TMP16]], i8** [[TMP15]], align 8 +// CHECK10-NEXT: [[TMP17:%.*]] = bitcast [3 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK10-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i64, i64, [10 x i32]*)* @__omp_outlined__5 to i8*), i8* null, i8** [[TMP17]], i64 3) +// CHECK10-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK10: omp.inner.for.inc: +// CHECK10-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK10-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK10-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP18]], [[TMP19]] +// CHECK10-NEXT: store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4 +// CHECK10-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK10-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK10-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP20]], [[TMP21]] +// CHECK10-NEXT: store i32 [[ADD2]], i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK10-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK10-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK10-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP22]], [[TMP23]] +// CHECK10-NEXT: store i32 [[ADD3]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK10-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK10-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP24]], 9 +// CHECK10-NEXT: br i1 [[CMP4]], label [[COND_TRUE5:%.*]], label [[COND_FALSE6:%.*]] +// CHECK10: cond.true5: +// CHECK10-NEXT: br label [[COND_END7:%.*]] +// CHECK10: cond.false6: +// CHECK10-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK10-NEXT: br label [[COND_END7]] +// CHECK10: cond.end7: +// CHECK10-NEXT: [[COND8:%.*]] = phi i32 [ 9, [[COND_TRUE5]] ], [ [[TMP25]], [[COND_FALSE6]] ] +// CHECK10-NEXT: store i32 [[COND8]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK10-NEXT: [[TMP26:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK10-NEXT: store i32 [[TMP26]], i32* 
[[DOTOMP_IV]], align 4 +// CHECK10-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK10: omp.inner.for.end: +// CHECK10-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK10: omp.loop.exit: +// CHECK10-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]]) +// CHECK10-NEXT: ret void +// +// +// CHECK10-LABEL: define {{[^@]+}}@__omp_outlined__5 +// CHECK10-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { +// CHECK10-NEXT: entry: +// CHECK10-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK10-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK10-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8 +// CHECK10-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8 +// CHECK10-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 8 +// CHECK10-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK10-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK10-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK10-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK10-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK10-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK10-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK10-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK10-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +// CHECK10-NEXT: store i64 [[DOTPREVIOUS_LB_]], i64* [[DOTPREVIOUS_LB__ADDR]], align 8 +// CHECK10-NEXT: store i64 [[DOTPREVIOUS_UB_]], i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK10-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 8 +// CHECK10-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 8 +// CHECK10-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 +// CHECK10-NEXT: store i32 9, i32* [[DOTOMP_UB]], align 4 +// CHECK10-NEXT: [[TMP1:%.*]] = load i64, i64* [[DOTPREVIOUS_LB__ADDR]], align 8 +// CHECK10-NEXT: [[CONV:%.*]] = trunc i64 [[TMP1]] to i32 +// CHECK10-NEXT: [[TMP2:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK10-NEXT: [[CONV1:%.*]] = trunc i64 [[TMP2]] to i32 +// CHECK10-NEXT: store i32 [[CONV]], i32* [[DOTOMP_LB]], align 4 +// CHECK10-NEXT: store i32 [[CONV1]], i32* [[DOTOMP_UB]], align 4 +// CHECK10-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK10-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK10-NEXT: [[TMP3:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK10-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4 +// CHECK10-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP4]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) +// CHECK10-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK10-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 +// CHECK10-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK10: omp.inner.for.cond: +// CHECK10-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK10-NEXT: [[CONV2:%.*]] = sext i32 [[TMP6]] to i64 +// CHECK10-NEXT: [[TMP7:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK10-NEXT: [[CMP:%.*]] = icmp ule i64 [[CONV2]], [[TMP7]] +// CHECK10-NEXT: br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK10: omp.inner.for.body: +// CHECK10-NEXT: [[TMP8:%.*]] = load 
i32, i32* [[DOTOMP_IV]], align 4
+// CHECK10-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP8]], 1
+// CHECK10-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK10-NEXT: store i32 [[ADD]], i32* [[I]], align 4
+// CHECK10-NEXT: [[TMP9:%.*]] = load i32, i32* [[I]], align 4
+// CHECK10-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP9]] to i64
+// CHECK10-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP0]], i64 0, i64 [[IDXPROM]]
+// CHECK10-NEXT: [[TMP10:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+// CHECK10-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP10]], 1
+// CHECK10-NEXT: store i32 [[ADD3]], i32* [[ARRAYIDX]], align 4
+// CHECK10-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK10: omp.body.continue:
+// CHECK10-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK10: omp.inner.for.inc:
+// CHECK10-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK10-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK10-NEXT: [[ADD4:%.*]] = add nsw i32 [[TMP11]], [[TMP12]]
+// CHECK10-NEXT: store i32 [[ADD4]], i32* [[DOTOMP_IV]], align 4
+// CHECK10-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK10: omp.inner.for.end:
+// CHECK10-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK10: omp.loop.exit:
+// CHECK10-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP4]])
+// CHECK10-NEXT: ret void
+//
+//
+// CHECK10-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l59
+// CHECK10-SAME: ([10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i64 [[F:%.*]]) #[[ATTR0]] {
+// CHECK10-NEXT: entry:
+// CHECK10-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 8
+// CHECK10-NEXT: [[F_ADDR:%.*]] = alloca i64, align 8
+// CHECK10-NEXT: [[F_CASTED:%.*]] = alloca i64, align 8
+// CHECK10-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK10-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK10-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK10-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 8
+// CHECK10-NEXT: store i64 [[F]], i64* [[F_ADDR]], align 8
+// CHECK10-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 8
+// CHECK10-NEXT: [[CONV:%.*]] = bitcast i64* [[F_ADDR]] to i32*
+// CHECK10-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK10-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0)
+// CHECK10-NEXT: br label [[DOTEXECUTE:%.*]]
+// CHECK10: .execute:
+// CHECK10-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]])
+// CHECK10-NEXT: [[TMP2:%.*]] = load i32, i32* [[CONV]], align 8
+// CHECK10-NEXT: [[CONV1:%.*]] = bitcast i64* [[F_CASTED]] to i32*
+// CHECK10-NEXT: store i32 [[TMP2]], i32* [[CONV1]], align 4
+// CHECK10-NEXT: [[TMP3:%.*]] = load i64, i64* [[F_CASTED]], align 8
+// CHECK10-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK10-NEXT: call void @__omp_outlined__6(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x [10 x i32]]* [[TMP0]], i64 [[TMP3]]) #[[ATTR2]]
+// CHECK10-NEXT: br label [[DOTOMP_DEINIT:%.*]]
+// CHECK10: .omp.deinit:
+// CHECK10-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0)
+// CHECK10-NEXT: br label [[DOTEXIT:%.*]]
+// CHECK10: .exit:
+// CHECK10-NEXT: ret void
+//
+//
+// CHECK10-LABEL: define {{[^@]+}}@__omp_outlined__6
+// CHECK10-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i64 [[F:%.*]]) #[[ATTR0]] {
+// CHECK10-NEXT: entry:
+// CHECK10-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK10-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK10-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 8
+// CHECK10-NEXT: [[F_ADDR:%.*]] = alloca i64, align 8
+// CHECK10-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK10-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK10-NEXT: [[_TMP1:%.*]] = alloca i32, align 4
+// CHECK10-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK10-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK10-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK10-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK10-NEXT: [[K:%.*]] = alloca i32, align 4
+// CHECK10-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK10-NEXT: [[J:%.*]] = alloca i32, align 4
+// CHECK10-NEXT: [[F_CASTED:%.*]] = alloca i64, align 8
+// CHECK10-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x i8*], align 8
+// CHECK10-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK10-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
+// CHECK10-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 8
+// CHECK10-NEXT: store i64 [[F]], i64* [[F_ADDR]], align 8
+// CHECK10-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 8
+// CHECK10-NEXT: [[CONV:%.*]] = bitcast i64* [[F_ADDR]] to i32*
+// CHECK10-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK10-NEXT: store i32 99, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK10-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK10-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK10-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK10-NEXT: [[TMP1:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK10-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4
+// CHECK10-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]])
+// CHECK10-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK10-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP3]], 99
+// CHECK10-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK10: cond.true:
+// CHECK10-NEXT: br label [[COND_END:%.*]]
+// CHECK10: cond.false:
+// CHECK10-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK10-NEXT: br label [[COND_END]]
+// CHECK10: cond.end:
+// CHECK10-NEXT: [[COND:%.*]] = phi i32 [ 99, [[COND_TRUE]] ], [ [[TMP4]], [[COND_FALSE]] ]
+// CHECK10-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK10-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK10-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4
+// CHECK10-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK10: omp.inner.for.cond:
+// CHECK10-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK10-NEXT: [[CMP2:%.*]] = icmp slt i32 [[TMP6]], 100
+// CHECK10-NEXT: br i1 [[CMP2]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK10: omp.inner.for.body:
+// CHECK10-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK10-NEXT: [[TMP8:%.*]] = zext i32 [[TMP7]] to i64
+// CHECK10-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK10-NEXT: [[TMP10:%.*]] = zext i32 [[TMP9]] to i64
+// CHECK10-NEXT: [[TMP11:%.*]] = load i32, i32* [[CONV]], align 8
+// CHECK10-NEXT: [[CONV3:%.*]] = bitcast i64* [[F_CASTED]] to i32*
+// CHECK10-NEXT: store i32 [[TMP11]], i32* [[CONV3]], align 4
+// CHECK10-NEXT: [[TMP12:%.*]] = load i64, i64* [[F_CASTED]], align 8
+// CHECK10-NEXT: [[TMP13:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0
+// CHECK10-NEXT: [[TMP14:%.*]] = inttoptr i64 [[TMP8]] to i8*
+// CHECK10-NEXT: store i8* [[TMP14]], i8** [[TMP13]], align 8
+// CHECK10-NEXT: [[TMP15:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1
+// CHECK10-NEXT: [[TMP16:%.*]] = inttoptr i64 [[TMP10]] to i8*
+// CHECK10-NEXT: store i8* [[TMP16]], i8** [[TMP15]], align 8
+// CHECK10-NEXT: [[TMP17:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 2
+// CHECK10-NEXT: [[TMP18:%.*]] = bitcast [10 x [10 x i32]]* [[TMP0]] to i8*
+// CHECK10-NEXT: store i8* [[TMP18]], i8** [[TMP17]], align 8
+// CHECK10-NEXT: [[TMP19:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 3
+// CHECK10-NEXT: [[TMP20:%.*]] = inttoptr i64 [[TMP12]] to i8*
+// CHECK10-NEXT: store i8* [[TMP20]], i8** [[TMP19]], align 8
+// CHECK10-NEXT: [[TMP21:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK10-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i64, i64, [10 x [10 x i32]]*, i64)* @__omp_outlined__7 to i8*), i8* null, i8** [[TMP21]], i64 4)
+// CHECK10-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK10: omp.inner.for.inc:
+// CHECK10-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK10-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK10-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP22]], [[TMP23]]
+// CHECK10-NEXT: store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4
+// CHECK10-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK10-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK10-NEXT: [[ADD4:%.*]] = add nsw i32 [[TMP24]], [[TMP25]]
+// CHECK10-NEXT: store i32 [[ADD4]], i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK10-NEXT: [[TMP26:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK10-NEXT: [[TMP27:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK10-NEXT: [[ADD5:%.*]] = add nsw i32 [[TMP26]], [[TMP27]]
+// CHECK10-NEXT: store i32 [[ADD5]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK10-NEXT: [[TMP28:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK10-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[TMP28]], 99
+// CHECK10-NEXT: br i1 [[CMP6]], label [[COND_TRUE7:%.*]], label [[COND_FALSE8:%.*]]
+// CHECK10: cond.true7:
+// CHECK10-NEXT: br label [[COND_END9:%.*]]
+// CHECK10: cond.false8:
+// CHECK10-NEXT: [[TMP29:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK10-NEXT: br label [[COND_END9]]
+// CHECK10: cond.end9:
+// CHECK10-NEXT: [[COND10:%.*]] = phi i32 [ 99, [[COND_TRUE7]] ], [ [[TMP29]], [[COND_FALSE8]] ]
+// CHECK10-NEXT: store i32 [[COND10]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK10-NEXT: [[TMP30:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK10-NEXT: store i32 [[TMP30]], i32* [[DOTOMP_IV]], align 4
+// CHECK10-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK10: omp.inner.for.end:
+// CHECK10-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK10: omp.loop.exit:
+// CHECK10-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]])
+// CHECK10-NEXT: ret void
+//
+//
+// CHECK10-LABEL: define {{[^@]+}}@__omp_outlined__7
+// CHECK10-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i64 [[F:%.*]]) #[[ATTR0]] {
+// CHECK10-NEXT: entry:
+// CHECK10-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK10-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK10-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8
+// CHECK10-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8
+// CHECK10-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 8
+// CHECK10-NEXT: [[F_ADDR:%.*]] = alloca i64, align 8
+// CHECK10-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK10-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK10-NEXT: [[_TMP1:%.*]] = alloca i32, align 4
+// CHECK10-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK10-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK10-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK10-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK10-NEXT: [[K:%.*]] = alloca i32, align 4
+// CHECK10-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK10-NEXT: [[J:%.*]] = alloca i32, align 4
+// CHECK10-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK10-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
+// CHECK10-NEXT: store i64 [[DOTPREVIOUS_LB_]], i64* [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK10-NEXT: store i64 [[DOTPREVIOUS_UB_]], i64* [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK10-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 8
+// CHECK10-NEXT: store i64 [[F]], i64* [[F_ADDR]], align 8
+// CHECK10-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 8
+// CHECK10-NEXT: [[CONV:%.*]] = bitcast i64* [[F_ADDR]] to i32*
+// CHECK10-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK10-NEXT: store i32 99, i32* [[DOTOMP_UB]], align 4
+// CHECK10-NEXT: [[TMP1:%.*]] = load i64, i64* [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK10-NEXT: [[CONV2:%.*]] = trunc i64 [[TMP1]] to i32
+// CHECK10-NEXT: [[TMP2:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK10-NEXT: [[CONV3:%.*]] = trunc i64 [[TMP2]] to i32
+// CHECK10-NEXT: store i32 [[CONV2]], i32* [[DOTOMP_LB]], align 4
+// CHECK10-NEXT: store i32 [[CONV3]], i32* [[DOTOMP_UB]], align 4
+// CHECK10-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK10-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK10-NEXT: [[TMP3:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK10-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4
+// CHECK10-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP4]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK10-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK10-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4
+// CHECK10-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK10: omp.inner.for.cond:
+// CHECK10-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK10-NEXT: [[CONV4:%.*]] = sext i32 [[TMP6]] to i64
+// CHECK10-NEXT: [[TMP7:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK10-NEXT: [[CMP:%.*]] = icmp ule i64 [[CONV4]], [[TMP7]]
+// CHECK10-NEXT: br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK10: omp.inner.for.body:
+// CHECK10-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK10-NEXT: [[DIV:%.*]] = sdiv i32 [[TMP8]], 10
+// CHECK10-NEXT: [[MUL:%.*]] = mul nsw i32 [[DIV]], 1
+// CHECK10-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK10-NEXT: store i32 [[ADD]], i32* [[I]], align 4
+// CHECK10-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK10-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK10-NEXT: [[DIV5:%.*]] = sdiv i32 [[TMP10]], 10
+// CHECK10-NEXT: [[MUL6:%.*]] = mul nsw i32 [[DIV5]], 10
+// CHECK10-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP9]], [[MUL6]]
+// CHECK10-NEXT: [[MUL7:%.*]] = mul nsw i32 [[SUB]], 1
+// CHECK10-NEXT: [[ADD8:%.*]] = add nsw i32 0, [[MUL7]]
+// CHECK10-NEXT: store i32 [[ADD8]], i32* [[J]], align 4
+// CHECK10-NEXT: store i32 10, i32* [[K]], align 4
+// CHECK10-NEXT: [[TMP11:%.*]] = load i32, i32* [[I]], align 4
+// CHECK10-NEXT: [[TMP12:%.*]] = load i32, i32* [[J]], align 4
+// CHECK10-NEXT: [[TMP13:%.*]] = load i32, i32* [[CONV]], align 8
+// CHECK10-NEXT: [[MUL9:%.*]] = mul nsw i32 [[TMP12]], [[TMP13]]
+// CHECK10-NEXT: [[ADD10:%.*]] = add nsw i32 [[TMP11]], [[MUL9]]
+// CHECK10-NEXT: [[TMP14:%.*]] = load i32, i32* [[K]], align 4
+// CHECK10-NEXT: [[ADD11:%.*]] = add nsw i32 [[ADD10]], [[TMP14]]
+// CHECK10-NEXT: [[TMP15:%.*]] = load i32, i32* [[I]], align 4
+// CHECK10-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP15]] to i64
+// CHECK10-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* [[TMP0]], i64 0, i64 [[IDXPROM]]
+// CHECK10-NEXT: [[TMP16:%.*]] = load i32, i32* [[J]], align 4
+// CHECK10-NEXT: [[IDXPROM12:%.*]] = sext i32 [[TMP16]] to i64
+// CHECK10-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[ARRAYIDX]], i64 0, i64 [[IDXPROM12]]
+// CHECK10-NEXT: store i32 [[ADD11]], i32* [[ARRAYIDX13]], align 4
+// CHECK10-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK10: omp.body.continue:
+// CHECK10-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK10: omp.inner.for.inc:
+// CHECK10-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK10-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK10-NEXT: [[ADD14:%.*]] = add nsw i32 [[TMP17]], [[TMP18]]
+// CHECK10-NEXT: store i32 [[ADD14]], i32* [[DOTOMP_IV]], align 4
+// CHECK10-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK10: omp.inner.for.end:
+// CHECK10-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK10: omp.loop.exit:
+// CHECK10-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP4]])
+// CHECK10-NEXT: ret void
+//
+//
+// CHECK10-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l67
+// CHECK10-SAME: (i64 [[N:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]]) #[[ATTR0]] {
+// CHECK10-NEXT: entry:
+// CHECK10-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8
+// CHECK10-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 8
+// CHECK10-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8
+// CHECK10-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK10-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK10-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK10-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8
+// CHECK10-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 8
+// CHECK10-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32*
+// CHECK10-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 8
+// CHECK10-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK10-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0)
+// CHECK10-NEXT: br label [[DOTEXECUTE:%.*]]
+// CHECK10: .execute:
+// CHECK10-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]])
+// CHECK10-NEXT: [[TMP2:%.*]] = load i32, i32* [[CONV]], align 8
+// CHECK10-NEXT: [[CONV1:%.*]] = bitcast i64* [[N_CASTED]] to i32*
+// CHECK10-NEXT: store i32 [[TMP2]], i32* [[CONV1]], align 4
+// CHECK10-NEXT: [[TMP3:%.*]] = load i64, i64* [[N_CASTED]], align 8
+// CHECK10-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK10-NEXT: call void @__omp_outlined__8(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i64 [[TMP3]], [10 x [10 x i32]]* [[TMP0]]) #[[ATTR2]]
+// CHECK10-NEXT: br label [[DOTOMP_DEINIT:%.*]]
+// CHECK10: .omp.deinit:
+// CHECK10-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0)
+// CHECK10-NEXT: br label [[DOTEXIT:%.*]]
+// CHECK10: .exit:
+// CHECK10-NEXT: ret void
+//
+//
+// CHECK10-LABEL: define {{[^@]+}}@__omp_outlined__8
+// CHECK10-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[N:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]]) #[[ATTR0]] {
+// CHECK10-NEXT: entry:
+// CHECK10-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK10-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK10-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8
+// CHECK10-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 8
+// CHECK10-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK10-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK10-NEXT: [[_TMP1:%.*]] = alloca i32, align 4
+// CHECK10-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK10-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4
+// CHECK10-NEXT: [[DOTCAPTURE_EXPR_3:%.*]] = alloca i32, align 4
+// CHECK10-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK10-NEXT: [[J:%.*]] = alloca i32, align 4
+// CHECK10-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK10-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK10-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK10-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK10-NEXT: [[I8:%.*]] = alloca i32, align 4
+// CHECK10-NEXT: [[J9:%.*]] = alloca i32, align 4
+// CHECK10-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8
+// CHECK10-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x i8*], align 8
+// CHECK10-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK10-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
+// CHECK10-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8
+// CHECK10-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 8
+// CHECK10-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32*
+// CHECK10-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 8
+// CHECK10-NEXT: [[TMP1:%.*]] = load i32, i32* [[CONV]], align 8
+// CHECK10-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK10-NEXT: [[TMP2:%.*]] = load i32, i32* [[CONV]], align 8
+// CHECK10-NEXT: store i32 [[TMP2]], i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK10-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK10-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP3]], 0
+// CHECK10-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK10-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK10-NEXT: [[SUB4:%.*]] = sub nsw i32 [[TMP4]], 0
+// CHECK10-NEXT: [[DIV5:%.*]] = sdiv i32 [[SUB4]], 1
+// CHECK10-NEXT: [[MUL:%.*]] = mul nsw i32 [[DIV]], [[DIV5]]
+// CHECK10-NEXT: [[SUB6:%.*]] = sub nsw i32 [[MUL]], 1
+// CHECK10-NEXT: store i32 [[SUB6]], i32* [[DOTCAPTURE_EXPR_3]], align 4
+// CHECK10-NEXT: store i32 0, i32* [[I]], align 4
+// CHECK10-NEXT: store i32 0, i32* [[J]], align 4
+// CHECK10-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK10-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP5]]
+// CHECK10-NEXT: br i1 [[CMP]], label [[LAND_LHS_TRUE:%.*]], label [[OMP_PRECOND_END:%.*]]
+// CHECK10: land.lhs.true:
+// CHECK10-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK10-NEXT: [[CMP7:%.*]] = icmp slt i32 0, [[TMP6]]
+// CHECK10-NEXT: br i1 [[CMP7]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END]]
+// CHECK10: omp.precond.then:
+// CHECK10-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK10-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4
+// CHECK10-NEXT: store i32 [[TMP7]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK10-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK10-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK10-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK10-NEXT: [[TMP8:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK10-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP8]], align 4
+// CHECK10-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP9]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]])
+// CHECK10-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK10-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4
+// CHECK10-NEXT: [[CMP10:%.*]] = icmp sgt i32 [[TMP10]], [[TMP11]]
+// CHECK10-NEXT: br i1 [[CMP10]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK10: cond.true:
+// CHECK10-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4
+// CHECK10-NEXT: br label [[COND_END:%.*]]
+// CHECK10: cond.false:
+// CHECK10-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK10-NEXT: br label [[COND_END]]
+// CHECK10: cond.end:
+// CHECK10-NEXT: [[COND:%.*]] = phi i32 [ [[TMP12]], [[COND_TRUE]] ], [ [[TMP13]], [[COND_FALSE]] ]
+// CHECK10-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK10-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK10-NEXT: store i32 [[TMP14]], i32* [[DOTOMP_IV]], align 4
+// CHECK10-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK10: omp.inner.for.cond:
+// CHECK10-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK10-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4
+// CHECK10-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP16]], 1
+// CHECK10-NEXT: [[CMP11:%.*]] = icmp slt i32 [[TMP15]], [[ADD]]
+// CHECK10-NEXT: br i1 [[CMP11]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK10: omp.inner.for.body:
+// CHECK10-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK10-NEXT: [[TMP18:%.*]] = zext i32 [[TMP17]] to i64
+// CHECK10-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK10-NEXT: [[TMP20:%.*]] = zext i32 [[TMP19]] to i64
+// CHECK10-NEXT: [[TMP21:%.*]] = load i32, i32* [[CONV]], align 8
+// CHECK10-NEXT: [[CONV12:%.*]] = bitcast i64* [[N_CASTED]] to i32*
+// CHECK10-NEXT: store i32 [[TMP21]], i32* [[CONV12]], align 4
+// CHECK10-NEXT: [[TMP22:%.*]] = load i64, i64* [[N_CASTED]], align 8
+// CHECK10-NEXT: [[TMP23:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0
+// CHECK10-NEXT: [[TMP24:%.*]] = inttoptr i64 [[TMP18]] to i8*
+// CHECK10-NEXT: store i8* [[TMP24]], i8** [[TMP23]], align 8
+// CHECK10-NEXT: [[TMP25:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1
+// CHECK10-NEXT: [[TMP26:%.*]] = inttoptr i64 [[TMP20]] to i8*
+// CHECK10-NEXT: store i8* [[TMP26]], i8** [[TMP25]], align 8
+// CHECK10-NEXT: [[TMP27:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 2
+// CHECK10-NEXT: [[TMP28:%.*]] = inttoptr i64 [[TMP22]] to i8*
+// CHECK10-NEXT: store i8* [[TMP28]], i8** [[TMP27]], align 8
+// CHECK10-NEXT: [[TMP29:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 3
+// CHECK10-NEXT: [[TMP30:%.*]] = bitcast [10 x [10 x i32]]* [[TMP0]] to i8*
+// CHECK10-NEXT: store i8* [[TMP30]], i8** [[TMP29]], align 8
+// CHECK10-NEXT: [[TMP31:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK10-NEXT: [[TMP32:%.*]] = load i32, i32* [[TMP31]], align 4
+// CHECK10-NEXT: [[TMP33:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK10-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP32]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i64, i64, i64, [10 x [10 x i32]]*)* @__omp_outlined__9 to i8*), i8* null, i8** [[TMP33]], i64 4)
+// CHECK10-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK10: omp.inner.for.inc:
+// CHECK10-NEXT: [[TMP34:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK10-NEXT: [[TMP35:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK10-NEXT: [[ADD13:%.*]] = add nsw i32 [[TMP34]], [[TMP35]]
+// CHECK10-NEXT: store i32 [[ADD13]], i32* [[DOTOMP_IV]], align 4
+// CHECK10-NEXT: [[TMP36:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK10-NEXT: [[TMP37:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK10-NEXT: [[ADD14:%.*]] = add nsw i32 [[TMP36]], [[TMP37]]
+// CHECK10-NEXT: store i32 [[ADD14]], i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK10-NEXT: [[TMP38:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK10-NEXT: [[TMP39:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK10-NEXT: [[ADD15:%.*]] = add nsw i32 [[TMP38]], [[TMP39]]
+// CHECK10-NEXT: store i32 [[ADD15]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK10-NEXT: [[TMP40:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK10-NEXT: [[TMP41:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4
+// CHECK10-NEXT: [[CMP16:%.*]] = icmp sgt i32 [[TMP40]], [[TMP41]]
+// CHECK10-NEXT: br i1 [[CMP16]], label [[COND_TRUE17:%.*]], label [[COND_FALSE18:%.*]]
+// CHECK10: cond.true17:
+// CHECK10-NEXT: [[TMP42:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4
+// CHECK10-NEXT: br label [[COND_END19:%.*]]
+// CHECK10: cond.false18:
+// CHECK10-NEXT: [[TMP43:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK10-NEXT: br label [[COND_END19]]
+// CHECK10: cond.end19:
+// CHECK10-NEXT: [[COND20:%.*]] = phi i32 [ [[TMP42]], [[COND_TRUE17]] ], [ [[TMP43]], [[COND_FALSE18]] ]
+// CHECK10-NEXT: store i32 [[COND20]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK10-NEXT: [[TMP44:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK10-NEXT: store i32 [[TMP44]], i32* [[DOTOMP_IV]], align 4
+// CHECK10-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK10: omp.inner.for.end:
+// CHECK10-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK10: omp.loop.exit:
+// CHECK10-NEXT: [[TMP45:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK10-NEXT: [[TMP46:%.*]] = load i32, i32* [[TMP45]], align 4
+// CHECK10-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP46]])
+// CHECK10-NEXT: br label [[OMP_PRECOND_END]]
+// CHECK10: omp.precond.end:
+// CHECK10-NEXT: ret void
+//
+//
+// CHECK10-LABEL: define {{[^@]+}}@__omp_outlined__9
+// CHECK10-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], i64 [[N:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]]) #[[ATTR0]] {
+// CHECK10-NEXT: entry:
+// CHECK10-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK10-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK10-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8
+// CHECK10-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8
+// CHECK10-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8
+// CHECK10-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 8
+// CHECK10-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK10-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK10-NEXT: [[_TMP1:%.*]] = alloca i32, align 4
+// CHECK10-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK10-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4
+// CHECK10-NEXT: [[DOTCAPTURE_EXPR_3:%.*]] = alloca i32, align 4
+// CHECK10-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK10-NEXT: [[J:%.*]] = alloca i32, align 4
+// CHECK10-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK10-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK10-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK10-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK10-NEXT: [[I10:%.*]] = alloca i32, align 4
+// CHECK10-NEXT: [[J11:%.*]] = alloca i32, align 4
+// CHECK10-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK10-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
+// CHECK10-NEXT: store i64 [[DOTPREVIOUS_LB_]], i64* [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK10-NEXT: store i64 [[DOTPREVIOUS_UB_]], i64* [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK10-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8
+// CHECK10-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 8
+// CHECK10-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32*
+// CHECK10-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 8
+// CHECK10-NEXT: [[TMP1:%.*]] = load i32, i32* [[CONV]], align 8
+// CHECK10-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK10-NEXT: [[TMP2:%.*]] = load i32, i32* [[CONV]], align 8
+// CHECK10-NEXT: store i32 [[TMP2]], i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK10-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK10-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP3]], 0
+// CHECK10-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK10-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK10-NEXT: [[SUB4:%.*]] = sub nsw i32 [[TMP4]], 0
+// CHECK10-NEXT: [[DIV5:%.*]] = sdiv i32 [[SUB4]], 1
+// CHECK10-NEXT: [[MUL:%.*]] = mul nsw i32 [[DIV]], [[DIV5]]
+// CHECK10-NEXT: [[SUB6:%.*]] = sub nsw i32 [[MUL]], 1
+// CHECK10-NEXT: store i32 [[SUB6]], i32* [[DOTCAPTURE_EXPR_3]], align 4
+// CHECK10-NEXT: store i32 0, i32* [[I]], align 4
+// CHECK10-NEXT: store i32 0, i32* [[J]], align 4
+// CHECK10-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK10-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP5]]
+// CHECK10-NEXT: br i1 [[CMP]], label [[LAND_LHS_TRUE:%.*]], label [[OMP_PRECOND_END:%.*]]
+// CHECK10: land.lhs.true:
+// CHECK10-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK10-NEXT: [[CMP7:%.*]] = icmp slt i32 0, [[TMP6]]
+// CHECK10-NEXT: br i1 [[CMP7]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END]]
+// CHECK10: omp.precond.then:
+// CHECK10-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK10-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4
+// CHECK10-NEXT: store i32 [[TMP7]], i32* [[DOTOMP_UB]], align 4
+// CHECK10-NEXT: [[TMP8:%.*]] = load i64, i64* [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK10-NEXT: [[CONV8:%.*]] = trunc i64 [[TMP8]] to i32
+// CHECK10-NEXT: [[TMP9:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK10-NEXT: [[CONV9:%.*]] = trunc i64 [[TMP9]] to i32
+// CHECK10-NEXT: store i32 [[CONV8]], i32* [[DOTOMP_LB]], align 4
+// CHECK10-NEXT: store i32 [[CONV9]], i32* [[DOTOMP_UB]], align 4
+// CHECK10-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK10-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK10-NEXT: [[TMP10:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK10-NEXT: [[TMP11:%.*]] = load i32, i32* [[TMP10]], align 4
+// CHECK10-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP11]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK10-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK10-NEXT: store i32 [[TMP12]], i32* [[DOTOMP_IV]], align 4
+// CHECK10-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK10: omp.inner.for.cond:
+// CHECK10-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK10-NEXT: [[CONV12:%.*]] = sext i32 [[TMP13]] to i64
+// CHECK10-NEXT: [[TMP14:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK10-NEXT: [[CMP13:%.*]] = icmp ule i64 [[CONV12]], [[TMP14]]
+// CHECK10-NEXT: br i1 [[CMP13]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK10: omp.inner.for.body:
+// CHECK10-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK10-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK10-NEXT: [[SUB14:%.*]] = sub nsw i32 [[TMP16]], 0
+// CHECK10-NEXT: [[DIV15:%.*]] = sdiv i32 [[SUB14]], 1
+// CHECK10-NEXT: [[MUL16:%.*]] = mul nsw i32 1, [[DIV15]]
+// CHECK10-NEXT: [[DIV17:%.*]] = sdiv i32 [[TMP15]], [[MUL16]]
+// CHECK10-NEXT: [[MUL18:%.*]] = mul nsw i32 [[DIV17]], 1
+// CHECK10-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL18]]
+// CHECK10-NEXT: store i32 [[ADD]], i32* [[I10]], align 4
+// CHECK10-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK10-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK10-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK10-NEXT: [[SUB19:%.*]] = sub nsw i32 [[TMP19]], 0
+// CHECK10-NEXT: [[DIV20:%.*]] = sdiv i32 [[SUB19]], 1
+// CHECK10-NEXT: [[MUL21:%.*]] = mul nsw i32 1, [[DIV20]]
+// CHECK10-NEXT: [[DIV22:%.*]] = sdiv i32 [[TMP18]], [[MUL21]]
+// CHECK10-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK10-NEXT: [[SUB23:%.*]] = sub nsw i32 [[TMP20]], 0
+// CHECK10-NEXT: [[DIV24:%.*]] = sdiv i32 [[SUB23]], 1
+// CHECK10-NEXT: [[MUL25:%.*]] = mul nsw i32 1, [[DIV24]]
+// CHECK10-NEXT: [[MUL26:%.*]] = mul nsw i32 [[DIV22]], [[MUL25]]
+// CHECK10-NEXT: [[SUB27:%.*]] = sub nsw i32 [[TMP17]], [[MUL26]]
+// CHECK10-NEXT: [[MUL28:%.*]] = mul nsw i32 [[SUB27]], 1
+// CHECK10-NEXT: [[ADD29:%.*]] = add nsw i32 0, [[MUL28]]
+// CHECK10-NEXT: store i32 [[ADD29]], i32* [[J11]], align 4
+// CHECK10-NEXT: [[TMP21:%.*]] = load i32, i32* [[I10]], align 4
+// CHECK10-NEXT: [[TMP22:%.*]] = load i32, i32* [[J11]], align 4
+// CHECK10-NEXT: [[ADD30:%.*]] = add nsw i32 [[TMP21]], [[TMP22]]
+// CHECK10-NEXT: [[TMP23:%.*]] = load i32, i32* [[I10]], align 4
+// CHECK10-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP23]] to i64
+// CHECK10-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* [[TMP0]], i64 0, i64 [[IDXPROM]]
+// CHECK10-NEXT: [[TMP24:%.*]] = load i32, i32* [[J11]], align 4
+// CHECK10-NEXT: [[IDXPROM31:%.*]] = sext i32 [[TMP24]] to i64
+// CHECK10-NEXT: [[ARRAYIDX32:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[ARRAYIDX]], i64 0, i64 [[IDXPROM31]]
+// CHECK10-NEXT: store i32 [[ADD30]], i32* [[ARRAYIDX32]], align 4
+// CHECK10-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK10: omp.body.continue:
+// CHECK10-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK10: omp.inner.for.inc:
+// CHECK10-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK10-NEXT: [[TMP26:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK10-NEXT: [[ADD33:%.*]] = add nsw i32 [[TMP25]], [[TMP26]]
+// CHECK10-NEXT: store i32 [[ADD33]], i32* [[DOTOMP_IV]], align 4
+// CHECK10-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK10: omp.inner.for.end:
+// CHECK10-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK10: omp.loop.exit:
+// CHECK10-NEXT: [[TMP27:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK10-NEXT: [[TMP28:%.*]] = load i32, i32* [[TMP27]], align 4
+// CHECK10-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP28]])
+// CHECK10-NEXT: br label [[OMP_PRECOND_END]]
+// CHECK10: omp.precond.end:
+// CHECK10-NEXT: ret void
+//
+//
+// CHECK10-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l74
+// CHECK10-SAME: (i64 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32* [[V:%.*]]) #[[ATTR0]] {
+// CHECK10-NEXT: entry:
+// CHECK10-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8
+// CHECK10-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 8
+// CHECK10-NEXT: [[V_ADDR:%.*]] = alloca i32*, align 8
+// CHECK10-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8
+// CHECK10-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK10-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK10-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK10-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8
+// CHECK10-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 8
+// CHECK10-NEXT: store i32* [[V]], i32** [[V_ADDR]], align 8
+// CHECK10-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32*
+// CHECK10-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 8
+// CHECK10-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK10-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0)
+// CHECK10-NEXT: br label [[DOTEXECUTE:%.*]]
+// CHECK10: .execute:
+// CHECK10-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]])
+// CHECK10-NEXT: [[TMP2:%.*]] = load i32, i32* [[CONV]], align 8
+// CHECK10-NEXT: [[CONV1:%.*]] = bitcast i64* [[N_CASTED]] to i32*
+// CHECK10-NEXT: store i32 [[TMP2]], i32* [[CONV1]], align 4
+// CHECK10-NEXT: [[TMP3:%.*]] = load i64, i64* [[N_CASTED]], align 8
+// CHECK10-NEXT: [[TMP4:%.*]] = load i32*, i32** [[V_ADDR]], align 8
+// CHECK10-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK10-NEXT: call void @__omp_outlined__10(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i64 [[TMP3]], [1000 x i32]* [[TMP0]], i32* [[TMP4]]) #[[ATTR2]]
+// CHECK10-NEXT: br label [[DOTOMP_DEINIT:%.*]]
+// CHECK10: .omp.deinit:
+// CHECK10-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0)
+// CHECK10-NEXT: br label [[DOTEXIT:%.*]]
+// CHECK10: .exit:
+// CHECK10-NEXT: ret void
+//
+//
+// CHECK10-LABEL: define {{[^@]+}}@__omp_outlined__10
+// CHECK10-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32* [[V:%.*]]) #[[ATTR0]] {
+// CHECK10-NEXT: entry:
+// CHECK10-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK10-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK10-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8
+// CHECK10-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 8
+// CHECK10-NEXT: [[V_ADDR:%.*]] = alloca i32*, align 8
+// CHECK10-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK10-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK10-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK10-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
+// CHECK10-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK10-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK10-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK10-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK10-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK10-NEXT: [[I3:%.*]] = alloca i32, align 4
+// CHECK10-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8
+// CHECK10-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [5 x i8*], align 8
+// CHECK10-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK10-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
+// CHECK10-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8
+// CHECK10-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 8
+// CHECK10-NEXT: store i32* [[V]], i32** [[V_ADDR]], align 8
+// CHECK10-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32*
+// CHECK10-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 8
+// CHECK10-NEXT: [[TMP1:%.*]] = load i32, i32* [[CONV]], align 8
+// CHECK10-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK10-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK10-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0
+// CHECK10-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK10-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1
+// CHECK10-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK10-NEXT: store i32 0, i32* [[I]], align 4
+// CHECK10-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK10-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]]
+// CHECK10-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]]
+// CHECK10: omp.precond.then:
+// CHECK10-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK10-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK10-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK10-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK10-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK10-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK10-NEXT: [[TMP5:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK10-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4
+// CHECK10-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP6]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]])
+// CHECK10-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK10-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK10-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP7]], [[TMP8]]
+// CHECK10-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK10: cond.true:
+// CHECK10-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK10-NEXT: br label [[COND_END:%.*]]
+// CHECK10: cond.false:
+// CHECK10-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK10-NEXT: br label [[COND_END]]
+// CHECK10: cond.end:
+// CHECK10-NEXT: [[COND:%.*]] = phi i32 [ [[TMP9]], [[COND_TRUE]] ], [ [[TMP10]], [[COND_FALSE]] ]
+// CHECK10-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK10-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK10-NEXT: store i32 [[TMP11]], i32* [[DOTOMP_IV]], align 4
+// CHECK10-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK10: omp.inner.for.cond:
+// CHECK10-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK10-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK10-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP13]], 1
+// CHECK10-NEXT: [[CMP5:%.*]] = icmp slt i32 [[TMP12]], [[ADD]]
+// CHECK10-NEXT: br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK10: omp.inner.for.body:
+// CHECK10-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK10-NEXT: [[TMP15:%.*]] = zext i32 [[TMP14]] to i64
+// CHECK10-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK10-NEXT: [[TMP17:%.*]] = zext i32 [[TMP16]] to i64
+// CHECK10-NEXT: [[TMP18:%.*]] = load i32, i32* [[CONV]], align 8
+// CHECK10-NEXT: [[CONV6:%.*]] = bitcast i64* [[N_CASTED]] to i32*
+// CHECK10-NEXT: store i32 [[TMP18]], i32* [[CONV6]], align 4
+// CHECK10-NEXT: [[TMP19:%.*]] = load i64, i64* [[N_CASTED]], align 8
+// CHECK10-NEXT: [[TMP20:%.*]] = load i32*, i32** [[V_ADDR]], align 8
+// CHECK10-NEXT: [[TMP21:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0
+// CHECK10-NEXT: [[TMP22:%.*]] = inttoptr i64 [[TMP15]] to i8*
+// CHECK10-NEXT: store i8* [[TMP22]], i8** [[TMP21]], align 8
+// CHECK10-NEXT: [[TMP23:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1
+// CHECK10-NEXT: [[TMP24:%.*]] = inttoptr i64 [[TMP17]] to i8*
+// CHECK10-NEXT: store i8* [[TMP24]], i8** [[TMP23]], align 8
+// CHECK10-NEXT: [[TMP25:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 2
+// CHECK10-NEXT: [[TMP26:%.*]] = inttoptr i64 [[TMP19]] to i8*
+// CHECK10-NEXT: store i8* [[TMP26]], i8** [[TMP25]], align 8
+// CHECK10-NEXT: [[TMP27:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 3
+// CHECK10-NEXT: [[TMP28:%.*]] = bitcast [1000 x i32]* [[TMP0]] to i8*
+// CHECK10-NEXT: store i8* [[TMP28]], i8** [[TMP27]], align 8
+// CHECK10-NEXT: [[TMP29:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 4
+// CHECK10-NEXT: [[TMP30:%.*]] = bitcast i32* [[TMP20]] to i8*
+// CHECK10-NEXT: store i8* [[TMP30]], i8** [[TMP29]], align 8
+// CHECK10-NEXT: [[TMP31:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK10-NEXT: [[TMP32:%.*]] = load i32, i32* [[TMP31]], align 4
+// CHECK10-NEXT: [[TMP33:%.*]] = bitcast [5 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK10-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP32]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i64, i64, i64, [1000 x i32]*, i32*)* @__omp_outlined__11 to i8*), i8* null, i8** [[TMP33]], i64 5)
+// CHECK10-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK10: omp.inner.for.inc:
+// CHECK10-NEXT: [[TMP34:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK10-NEXT: [[TMP35:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK10-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP34]], [[TMP35]]
+// CHECK10-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_IV]], align 4
+// CHECK10-NEXT: [[TMP36:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK10-NEXT: [[TMP37:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK10-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP36]], [[TMP37]]
+// CHECK10-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK10-NEXT: [[TMP38:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK10-NEXT: [[TMP39:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK10-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP38]], [[TMP39]]
+// CHECK10-NEXT: store i32 [[ADD9]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK10-NEXT: [[TMP40:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK10-NEXT: [[TMP41:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK10-NEXT: [[CMP10:%.*]] = icmp sgt i32 [[TMP40]], [[TMP41]]
+// CHECK10-NEXT: br i1 [[CMP10]], label [[COND_TRUE11:%.*]], label [[COND_FALSE12:%.*]]
+// CHECK10: cond.true11:
+// CHECK10-NEXT: [[TMP42:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK10-NEXT: br label [[COND_END13:%.*]]
+// CHECK10: cond.false12:
+// CHECK10-NEXT: [[TMP43:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK10-NEXT: br label [[COND_END13]]
+// CHECK10: cond.end13:
+// CHECK10-NEXT: [[COND14:%.*]] = phi i32 [ [[TMP42]], [[COND_TRUE11]] ], [ [[TMP43]], [[COND_FALSE12]] ]
+// CHECK10-NEXT: store i32 [[COND14]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK10-NEXT: [[TMP44:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK10-NEXT: store i32 [[TMP44]], i32* [[DOTOMP_IV]], align 4
+// CHECK10-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK10: omp.inner.for.end:
+// CHECK10-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK10: omp.loop.exit:
+// CHECK10-NEXT: [[TMP45:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK10-NEXT: [[TMP46:%.*]] = load i32, i32* [[TMP45]], align 4
+// CHECK10-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP46]])
+// CHECK10-NEXT: br label [[OMP_PRECOND_END]]
+// CHECK10: omp.precond.end:
+// CHECK10-NEXT: ret void
+//
+//
+// CHECK10-LABEL: define {{[^@]+}}@__omp_outlined__11
+// CHECK10-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], i64 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32* [[V:%.*]]) #[[ATTR0]] {
+// CHECK10-NEXT: entry:
+// CHECK10-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK10-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK10-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8
+// CHECK10-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8
+// CHECK10-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8
+// CHECK10-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 8
+// CHECK10-NEXT: [[V_ADDR:%.*]] = alloca i32*, align 8
+// CHECK10-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK10-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK10-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK10-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
+// CHECK10-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK10-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK10-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK10-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK10-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK10-NEXT: [[I5:%.*]] = alloca i32, align 4
+// CHECK10-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK10-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
+// CHECK10-NEXT: store i64 [[DOTPREVIOUS_LB_]], i64* [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK10-NEXT: store i64 [[DOTPREVIOUS_UB_]], i64* [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK10-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8
+// CHECK10-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 8
+// CHECK10-NEXT: store i32* [[V]], i32** [[V_ADDR]], align 8
+// CHECK10-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32*
+// CHECK10-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 8
+// CHECK10-NEXT: [[TMP1:%.*]] = load i32, i32* [[CONV]], align 8
+// CHECK10-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK10-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK10-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0
+// CHECK10-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK10-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1
+// CHECK10-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK10-NEXT: store i32 0, i32* [[I]], align 4
+// CHECK10-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK10-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]]
+// CHECK10-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]]
+// CHECK10: omp.precond.then:
+// CHECK10-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK10-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK10-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_UB]], align 4
+// CHECK10-NEXT: [[TMP5:%.*]] = load i64, i64* [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK10-NEXT: [[CONV3:%.*]] = trunc i64 [[TMP5]] to i32
+// CHECK10-NEXT: [[TMP6:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK10-NEXT: [[CONV4:%.*]] = trunc i64 [[TMP6]] to i32
+// CHECK10-NEXT: store i32 [[CONV3]], i32* [[DOTOMP_LB]], align 4
+// CHECK10-NEXT: store i32 [[CONV4]], i32* [[DOTOMP_UB]], align 4
+// CHECK10-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK10-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK10-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK10-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4
+// CHECK10-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP8]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK10-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK10-NEXT: store i32 [[TMP9]], i32* [[DOTOMP_IV]], align 4
+// CHECK10-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK10: omp.inner.for.cond:
+// CHECK10-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK10-NEXT: [[CONV6:%.*]] = sext i32 [[TMP10]] to i64
+// CHECK10-NEXT: [[TMP11:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK10-NEXT: [[CMP7:%.*]] = icmp ule i64 [[CONV6]], [[TMP11]]
+// CHECK10-NEXT: br i1 [[CMP7]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK10: omp.inner.for.body:
+// CHECK10-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK10-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP12]], 1
+// CHECK10-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK10-NEXT: store i32 [[ADD]], i32* [[I5]], align 4
+// CHECK10-NEXT: [[TMP13:%.*]] = load i32*, i32** [[V_ADDR]], align 8
+// CHECK10-NEXT: [[TMP14:%.*]] = load i32, i32* [[I5]], align 4
+// CHECK10-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP14]] to i64
+// CHECK10-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TMP13]], i64 [[IDXPROM]]
+// CHECK10-NEXT: [[TMP15:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+// CHECK10-NEXT: [[TMP16:%.*]] = load i32, i32* [[I5]], align 4
+// CHECK10-NEXT: [[IDXPROM8:%.*]] = sext i32 [[TMP16]] to i64
+// CHECK10-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds [1000 x i32], [1000 x i32]* [[TMP0]], i64 0, i64 [[IDXPROM8]]
+// CHECK10-NEXT: store i32 [[TMP15]], i32* [[ARRAYIDX9]], align 4
+// CHECK10-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK10: omp.body.continue:
+// CHECK10-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK10: omp.inner.for.inc:
+// CHECK10-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK10-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK10-NEXT: [[ADD10:%.*]] = add nsw i32 [[TMP17]], [[TMP18]]
+// CHECK10-NEXT: store i32 [[ADD10]], i32* [[DOTOMP_IV]], align 4
+// CHECK10-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK10: omp.inner.for.end:
+// CHECK10-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK10: omp.loop.exit:
+// CHECK10-NEXT: [[TMP19:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK10-NEXT: [[TMP20:%.*]] = load i32, i32* [[TMP19]], align 4
+// CHECK10-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP20]])
+// CHECK10-NEXT: br label [[OMP_PRECOND_END]]
+// CHECK10: omp.precond.end:
+// CHECK10-NEXT: ret void
+//
+//
+// CHECK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l43
+// CHECK11-SAME: (i64 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i64 [[L:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK11-NEXT: entry:
+// CHECK11-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8
+// CHECK11-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 8
+// CHECK11-NEXT: [[L_ADDR:%.*]] = alloca i64, align 8
+// CHECK11-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8
+// CHECK11-NEXT: [[L_CASTED:%.*]] = alloca i64, align 8
+// CHECK11-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK11-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8
+// CHECK11-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 8
+// CHECK11-NEXT: store i64 [[L]], i64* [[L_ADDR]], align 8
+// CHECK11-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32*
+// CHECK11-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 8
+// CHECK11-NEXT: [[CONV1:%.*]] = bitcast i64* [[L_ADDR]] to i32*
+// CHECK11-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK11-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0)
+// CHECK11-NEXT: br label [[DOTEXECUTE:%.*]]
+// CHECK11: .execute:
+// CHECK11-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3:[0-9]+]])
+// CHECK11-NEXT: [[TMP2:%.*]] = load i32, i32* [[CONV]], align 8
+// CHECK11-NEXT: [[CONV2:%.*]] = bitcast i64* [[N_CASTED]] to i32*
+// CHECK11-NEXT: store i32 [[TMP2]], i32* [[CONV2]], align 4
+// CHECK11-NEXT: [[TMP3:%.*]] = load i64, i64* [[N_CASTED]], align 8
+// CHECK11-NEXT: [[TMP4:%.*]] = load i32, i32* [[CONV1]], align 8
+// CHECK11-NEXT: [[CONV3:%.*]] = bitcast i64* [[L_CASTED]] to i32*
+// CHECK11-NEXT: store i32 [[TMP4]], i32* [[CONV3]], align 4
+// CHECK11-NEXT: [[TMP5:%.*]] = load i64, i64* [[L_CASTED]], align 8
+// CHECK11-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK11-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i64 [[TMP3]], [1000 x i32]* [[TMP0]], i64 [[TMP5]]) #[[ATTR2:[0-9]+]]
+// CHECK11-NEXT: br label [[DOTOMP_DEINIT:%.*]]
+// CHECK11: .omp.deinit:
+// CHECK11-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0)
+// CHECK11-NEXT: br label [[DOTEXIT:%.*]]
+// CHECK11: .exit:
+// CHECK11-NEXT: ret void
+//
+//
+// CHECK11-LABEL: define {{[^@]+}}@__omp_outlined__
+// CHECK11-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i64 [[L:%.*]]) #[[ATTR0]] {
+// CHECK11-NEXT: entry:
+// CHECK11-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK11-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK11-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8
+// CHECK11-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 8
+// CHECK11-NEXT: [[L_ADDR:%.*]] = alloca i64, align 8
+// CHECK11-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[DOTCAPTURE_EXPR_3:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[I5:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8
+// CHECK11-NEXT: [[L_CASTED:%.*]] = alloca i64, align 8
+// CHECK11-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [5 x i8*], align 8
+// CHECK11-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK11-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
+// CHECK11-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8
+// CHECK11-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 8
+// CHECK11-NEXT: store i64 [[L]], i64* [[L_ADDR]], align 8
+// CHECK11-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32*
+// CHECK11-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 8
+// CHECK11-NEXT: [[CONV1:%.*]] = bitcast i64* [[L_ADDR]] to i32*
+// CHECK11-NEXT: [[TMP1:%.*]] = call i8* @__kmpc_data_sharing_push_stack(i64 4, i16 1)
+// CHECK11-NEXT: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to %struct._globalized_locals_ty*
+// CHECK11-NEXT: [[L2:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP2]], i32 0, i32 0
+// CHECK11-NEXT: [[TMP3:%.*]] = load i32, i32* [[CONV]], align 8
+// CHECK11-NEXT: store i32 [[TMP3]], i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK11-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK11-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP4]], 0
+// CHECK11-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK11-NEXT: [[SUB4:%.*]] = sub nsw i32 [[DIV]], 1
+// CHECK11-NEXT: store i32 [[SUB4]], i32* [[DOTCAPTURE_EXPR_3]], align 4
+// CHECK11-NEXT: store i32 0, i32* [[I]], align 4
+// CHECK11-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK11-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP5]]
+// CHECK11-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]]
+// CHECK11: omp.precond.then:
+// CHECK11-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK11-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4
+// CHECK11-NEXT: store i32 [[TMP6]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK11-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK11-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK11-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK11-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4
+// CHECK11-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1:[0-9]+]], i32 [[TMP8]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 128)
+// CHECK11-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK11-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4
+// CHECK11-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[TMP9]], [[TMP10]]
+// CHECK11-NEXT: br i1 [[CMP6]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK11: cond.true:
+// CHECK11-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4
+// CHECK11-NEXT: br label [[COND_END:%.*]]
+// CHECK11: cond.false:
+// CHECK11-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK11-NEXT: br label [[COND_END]]
+// CHECK11: cond.end:
+// CHECK11-NEXT: [[COND:%.*]] = phi i32 [ [[TMP11]], [[COND_TRUE]] ], [ [[TMP12]], [[COND_FALSE]] ]
+// CHECK11-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK11-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK11-NEXT: store i32 [[TMP13]], i32* [[DOTOMP_IV]], align 4
+// CHECK11-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK11: omp.inner.for.cond:
+// CHECK11-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK11-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4
+// CHECK11-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP15]], 1
+// CHECK11-NEXT: [[CMP7:%.*]] = icmp slt i32 [[TMP14]], [[ADD]]
+// CHECK11-NEXT: br i1 [[CMP7]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK11: omp.inner.for.body:
+// CHECK11-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK11-NEXT: [[TMP17:%.*]] = zext i32 [[TMP16]] to i64
+// CHECK11-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK11-NEXT: [[TMP19:%.*]] = zext i32 [[TMP18]] to i64
+// CHECK11-NEXT: [[TMP20:%.*]] = load i32, i32* [[CONV]], align 8
+// CHECK11-NEXT: [[CONV8:%.*]] = bitcast i64* [[N_CASTED]] to i32*
+// CHECK11-NEXT: store i32 [[TMP20]], i32* [[CONV8]], align 4
+// CHECK11-NEXT: [[TMP21:%.*]] = load i64, i64* [[N_CASTED]], align 8
+// CHECK11-NEXT: [[TMP22:%.*]] = load i32, i32* [[CONV1]], align 8
+// CHECK11-NEXT: [[CONV9:%.*]] = bitcast i64* [[L_CASTED]] to i32*
+// CHECK11-NEXT: store i32 [[TMP22]], i32* [[CONV9]], align 4
+// CHECK11-NEXT: [[TMP23:%.*]] = load i64, i64* [[L_CASTED]], align 8
+// CHECK11-NEXT: [[TMP24:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0
+// CHECK11-NEXT: [[TMP25:%.*]] = inttoptr i64 [[TMP17]] to i8*
+// CHECK11-NEXT: store i8* [[TMP25]], i8** [[TMP24]], align 8
+// CHECK11-NEXT: [[TMP26:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1
+// CHECK11-NEXT: [[TMP27:%.*]] = inttoptr i64 [[TMP19]] to i8*
+// CHECK11-NEXT: store i8* [[TMP27]], i8** [[TMP26]], align 8
+// CHECK11-NEXT: [[TMP28:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 2
+// CHECK11-NEXT: [[TMP29:%.*]] = inttoptr i64 [[TMP21]] to i8*
+// CHECK11-NEXT: store i8* [[TMP29]], i8** [[TMP28]], align 8
+// CHECK11-NEXT: [[TMP30:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 3
+// CHECK11-NEXT: [[TMP31:%.*]] = bitcast [1000 x i32]* [[TMP0]] to i8*
+// CHECK11-NEXT: store i8* [[TMP31]], i8** [[TMP30]], align 8
+// CHECK11-NEXT: [[TMP32:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 4
+// CHECK11-NEXT: [[TMP33:%.*]] = inttoptr i64 [[TMP23]] to i8*
+// CHECK11-NEXT: store i8* [[TMP33]], i8** [[TMP32]], align 8
+// CHECK11-NEXT: [[TMP34:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK11-NEXT: [[TMP35:%.*]] = load i32, i32* [[TMP34]], align 4
+// CHECK11-NEXT: [[TMP36:%.*]] = bitcast [5 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK11-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP35]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i64, i64, i64, [1000 x i32]*, i64)* @__omp_outlined__1 to i8*), i8* null, i8** [[TMP36]], i64 5)
+// CHECK11-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK11: omp.inner.for.inc:
+// CHECK11-NEXT: [[TMP37:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK11-NEXT: [[TMP38:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK11-NEXT: [[ADD10:%.*]] = add nsw i32 [[TMP37]], [[TMP38]]
+// CHECK11-NEXT: store i32 [[ADD10]], i32* [[DOTOMP_IV]], align 4
+// CHECK11-NEXT: [[TMP39:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK11-NEXT: [[TMP40:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK11-NEXT: [[ADD11:%.*]] = add nsw i32 [[TMP39]], [[TMP40]]
+// CHECK11-NEXT: store i32 [[ADD11]], i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK11-NEXT: [[TMP41:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK11-NEXT: [[TMP42:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK11-NEXT: [[ADD12:%.*]] = add nsw i32 [[TMP41]], [[TMP42]]
+// CHECK11-NEXT: store i32 [[ADD12]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK11-NEXT: [[TMP43:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK11-NEXT: [[TMP44:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4
+// CHECK11-NEXT: [[CMP13:%.*]] = icmp sgt i32 [[TMP43]], [[TMP44]]
+// CHECK11-NEXT: br i1 [[CMP13]], label [[COND_TRUE14:%.*]], label [[COND_FALSE15:%.*]]
+// CHECK11: cond.true14:
+// CHECK11-NEXT: [[TMP45:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4
+// CHECK11-NEXT: br label [[COND_END16:%.*]]
+// CHECK11: cond.false15:
+// CHECK11-NEXT: [[TMP46:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK11-NEXT: br label [[COND_END16]]
+// CHECK11: cond.end16:
+// CHECK11-NEXT: [[COND17:%.*]] = phi i32 [ [[TMP45]], [[COND_TRUE14]] ], [ [[TMP46]], [[COND_FALSE15]] ]
+// CHECK11-NEXT: store i32 [[COND17]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK11-NEXT: [[TMP47:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK11-NEXT: store i32 [[TMP47]], i32* [[DOTOMP_IV]], align 4
+// CHECK11-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK11: omp.inner.for.end:
+// CHECK11-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK11: omp.loop.exit:
+// CHECK11-NEXT: [[TMP48:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK11-NEXT: [[TMP49:%.*]] = load i32, i32* [[TMP48]], align 4
+// CHECK11-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP49]])
+// CHECK11-NEXT: [[TMP50:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK11-NEXT: [[TMP51:%.*]] = icmp ne i32 [[TMP50]], 0
+// CHECK11-NEXT: br i1 [[TMP51]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]]
+// CHECK11: .omp.lastprivate.then:
+// CHECK11-NEXT: [[TMP52:%.*]] = load i32, i32* [[CONV1]], align 8
+// CHECK11-NEXT: store i32 [[TMP52]], i32* [[CONV1]], align 8
+// CHECK11-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]]
+// CHECK11: .omp.lastprivate.done:
+// CHECK11-NEXT: br label [[OMP_PRECOND_END]]
+// CHECK11: omp.precond.end:
+// CHECK11-NEXT: call void @__kmpc_data_sharing_pop_stack(i8* [[TMP1]])
+// CHECK11-NEXT: ret void
+//
+//
+// CHECK11-LABEL: define {{[^@]+}}@__omp_outlined__1
+// CHECK11-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], i64 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i64 [[L:%.*]]) #[[ATTR0]] {
+// CHECK11-NEXT: entry:
+// CHECK11-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK11-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK11-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8
+// CHECK11-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8
+// CHECK11-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8
+// CHECK11-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 8
+// CHECK11-NEXT: [[L_ADDR:%.*]] = alloca i64, align 8
+// CHECK11-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[I6:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK11-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
+// CHECK11-NEXT: store i64 [[DOTPREVIOUS_LB_]],
i64* [[DOTPREVIOUS_LB__ADDR]], align 8 +// CHECK11-NEXT: store i64 [[DOTPREVIOUS_UB_]], i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK11-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 +// CHECK11-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 8 +// CHECK11-NEXT: store i64 [[L]], i64* [[L_ADDR]], align 8 +// CHECK11-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* +// CHECK11-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 8 +// CHECK11-NEXT: [[CONV1:%.*]] = bitcast i64* [[L_ADDR]] to i32* +// CHECK11-NEXT: [[TMP1:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK11-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK11-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK11-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 +// CHECK11-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK11-NEXT: [[SUB3:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK11-NEXT: store i32 [[SUB3]], i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK11-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK11-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK11-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] +// CHECK11-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK11: omp.precond.then: +// CHECK11-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 +// CHECK11-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK11-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_UB]], align 4 +// CHECK11-NEXT: [[TMP5:%.*]] = load i64, i64* [[DOTPREVIOUS_LB__ADDR]], align 8 +// CHECK11-NEXT: [[CONV4:%.*]] = trunc i64 [[TMP5]] to i32 +// CHECK11-NEXT: [[TMP6:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK11-NEXT: [[CONV5:%.*]] = trunc i64 [[TMP6]] to i32 +// CHECK11-NEXT: store i32 [[CONV4]], i32* [[DOTOMP_LB]], align 4 +// CHECK11-NEXT: store i32 [[CONV5]], i32* [[DOTOMP_UB]], align 4 +// CHECK11-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK11-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK11-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK11-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4 +// CHECK11-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP8]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 32) +// CHECK11-NEXT: br label [[OMP_DISPATCH_COND:%.*]] +// CHECK11: omp.dispatch.cond: +// CHECK11-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK11-NEXT: [[CONV7:%.*]] = sext i32 [[TMP9]] to i64 +// CHECK11-NEXT: [[TMP10:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK11-NEXT: [[CMP8:%.*]] = icmp ugt i64 [[CONV7]], [[TMP10]] +// CHECK11-NEXT: br i1 [[CMP8]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK11: cond.true: +// CHECK11-NEXT: [[TMP11:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK11-NEXT: br label [[COND_END:%.*]] +// CHECK11: cond.false: +// CHECK11-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK11-NEXT: [[CONV9:%.*]] = sext i32 [[TMP12]] to i64 +// CHECK11-NEXT: br label [[COND_END]] +// CHECK11: cond.end: +// CHECK11-NEXT: [[COND:%.*]] = phi i64 [ [[TMP11]], [[COND_TRUE]] ], [ [[CONV9]], [[COND_FALSE]] ] +// CHECK11-NEXT: [[CONV10:%.*]] = trunc i64 [[COND]] to i32 +// CHECK11-NEXT: store i32 [[CONV10]], i32* [[DOTOMP_UB]], align 4 +// CHECK11-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// 
CHECK11-NEXT: store i32 [[TMP13]], i32* [[DOTOMP_IV]], align 4 +// CHECK11-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK11-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK11-NEXT: [[CMP11:%.*]] = icmp sle i32 [[TMP14]], [[TMP15]] +// CHECK11-NEXT: br i1 [[CMP11]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]] +// CHECK11: omp.dispatch.body: +// CHECK11-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK11: omp.inner.for.cond: +// CHECK11-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK11-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK11-NEXT: [[CMP12:%.*]] = icmp sle i32 [[TMP16]], [[TMP17]] +// CHECK11-NEXT: br i1 [[CMP12]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK11: omp.inner.for.body: +// CHECK11-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK11-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP18]], 1 +// CHECK11-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK11-NEXT: store i32 [[ADD]], i32* [[I6]], align 4 +// CHECK11-NEXT: [[TMP19:%.*]] = load i32, i32* [[I6]], align 4 +// CHECK11-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP19]] to i64 +// CHECK11-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1000 x i32], [1000 x i32]* [[TMP0]], i64 0, i64 [[IDXPROM]] +// CHECK11-NEXT: store i32 1, i32* [[ARRAYIDX]], align 4 +// CHECK11-NEXT: [[TMP20:%.*]] = load i32, i32* [[I6]], align 4 +// CHECK11-NEXT: store i32 [[TMP20]], i32* [[CONV1]], align 8 +// CHECK11-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK11: omp.body.continue: +// CHECK11-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK11: omp.inner.for.inc: +// CHECK11-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK11-NEXT: [[ADD13:%.*]] = add nsw i32 [[TMP21]], 1 +// CHECK11-NEXT: store i32 [[ADD13]], i32* [[DOTOMP_IV]], align 4 +// CHECK11-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK11: omp.inner.for.end: +// CHECK11-NEXT: br label [[OMP_DISPATCH_INC:%.*]] +// CHECK11: omp.dispatch.inc: +// CHECK11-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK11-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK11-NEXT: [[ADD14:%.*]] = add nsw i32 [[TMP22]], [[TMP23]] +// CHECK11-NEXT: store i32 [[ADD14]], i32* [[DOTOMP_LB]], align 4 +// CHECK11-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK11-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK11-NEXT: [[ADD15:%.*]] = add nsw i32 [[TMP24]], [[TMP25]] +// CHECK11-NEXT: store i32 [[ADD15]], i32* [[DOTOMP_UB]], align 4 +// CHECK11-NEXT: br label [[OMP_DISPATCH_COND]] +// CHECK11: omp.dispatch.end: +// CHECK11-NEXT: [[TMP26:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK11-NEXT: [[TMP27:%.*]] = load i32, i32* [[TMP26]], align 4 +// CHECK11-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP27]]) +// CHECK11-NEXT: [[TMP28:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK11-NEXT: [[TMP29:%.*]] = icmp ne i32 [[TMP28]], 0 +// CHECK11-NEXT: br i1 [[TMP29]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]] +// CHECK11: .omp.lastprivate.then: +// CHECK11-NEXT: [[TMP30:%.*]] = load i32, i32* [[CONV1]], align 8 +// CHECK11-NEXT: store i32 [[TMP30]], i32* [[CONV1]], align 8 +// CHECK11-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]] +// CHECK11: .omp.lastprivate.done: +// CHECK11-NEXT: br label [[OMP_PRECOND_END]] +// CHECK11: omp.precond.end: +// CHECK11-NEXT: ret void +// 
+// +// CHECK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l49 +// CHECK11-SAME: (i64 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { +// CHECK11-NEXT: entry: +// CHECK11-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 +// CHECK11-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 8 +// CHECK11-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8 +// CHECK11-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK11-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 +// CHECK11-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 8 +// CHECK11-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* +// CHECK11-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 8 +// CHECK11-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK11-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) +// CHECK11-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK11: .execute: +// CHECK11-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) +// CHECK11-NEXT: [[TMP2:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK11-NEXT: [[CONV1:%.*]] = bitcast i64* [[N_CASTED]] to i32* +// CHECK11-NEXT: store i32 [[TMP2]], i32* [[CONV1]], align 4 +// CHECK11-NEXT: [[TMP3:%.*]] = load i64, i64* [[N_CASTED]], align 8 +// CHECK11-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK11-NEXT: call void @__omp_outlined__2(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i64 [[TMP3]], [1000 x i16]* [[TMP0]]) #[[ATTR2]] +// CHECK11-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK11: .omp.deinit: +// CHECK11-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) +// CHECK11-NEXT: br label [[DOTEXIT:%.*]] +// CHECK11: .exit: +// CHECK11-NEXT: ret void +// +// +// CHECK11-LABEL: define {{[^@]+}}@__omp_outlined__2 +// CHECK11-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { +// CHECK11-NEXT: entry: +// CHECK11-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK11-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK11-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 +// CHECK11-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 8 +// CHECK11-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[I3:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8 +// CHECK11-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x i8*], align 8 +// CHECK11-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK11-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +// CHECK11-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 +// CHECK11-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 8 +// CHECK11-NEXT: 
[[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* +// CHECK11-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 8 +// CHECK11-NEXT: [[TMP1:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK11-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK11-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK11-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 +// CHECK11-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK11-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK11-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK11-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK11-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK11-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] +// CHECK11-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK11: omp.precond.then: +// CHECK11-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK11-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK11-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK11-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK11-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK11-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK11-NEXT: [[TMP5:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK11-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4 +// CHECK11-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP6]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) +// CHECK11-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK11-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK11-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP7]], [[TMP8]] +// CHECK11-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK11: cond.true: +// CHECK11-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK11-NEXT: br label [[COND_END:%.*]] +// CHECK11: cond.false: +// CHECK11-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK11-NEXT: br label [[COND_END]] +// CHECK11: cond.end: +// CHECK11-NEXT: [[COND:%.*]] = phi i32 [ [[TMP9]], [[COND_TRUE]] ], [ [[TMP10]], [[COND_FALSE]] ] +// CHECK11-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK11-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK11-NEXT: store i32 [[TMP11]], i32* [[DOTOMP_IV]], align 4 +// CHECK11-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK11: omp.inner.for.cond: +// CHECK11-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK11-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK11-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP13]], 1 +// CHECK11-NEXT: [[CMP5:%.*]] = icmp slt i32 [[TMP12]], [[ADD]] +// CHECK11-NEXT: br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK11: omp.inner.for.body: +// CHECK11-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK11-NEXT: [[TMP15:%.*]] = zext i32 [[TMP14]] to i64 +// CHECK11-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK11-NEXT: [[TMP17:%.*]] = zext i32 [[TMP16]] to i64 +// CHECK11-NEXT: [[TMP18:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK11-NEXT: [[CONV6:%.*]] = bitcast i64* [[N_CASTED]] to i32* 
+// CHECK11-NEXT: store i32 [[TMP18]], i32* [[CONV6]], align 4 +// CHECK11-NEXT: [[TMP19:%.*]] = load i64, i64* [[N_CASTED]], align 8 +// CHECK11-NEXT: [[TMP20:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0 +// CHECK11-NEXT: [[TMP21:%.*]] = inttoptr i64 [[TMP15]] to i8* +// CHECK11-NEXT: store i8* [[TMP21]], i8** [[TMP20]], align 8 +// CHECK11-NEXT: [[TMP22:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1 +// CHECK11-NEXT: [[TMP23:%.*]] = inttoptr i64 [[TMP17]] to i8* +// CHECK11-NEXT: store i8* [[TMP23]], i8** [[TMP22]], align 8 +// CHECK11-NEXT: [[TMP24:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 2 +// CHECK11-NEXT: [[TMP25:%.*]] = inttoptr i64 [[TMP19]] to i8* +// CHECK11-NEXT: store i8* [[TMP25]], i8** [[TMP24]], align 8 +// CHECK11-NEXT: [[TMP26:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 3 +// CHECK11-NEXT: [[TMP27:%.*]] = bitcast [1000 x i16]* [[TMP0]] to i8* +// CHECK11-NEXT: store i8* [[TMP27]], i8** [[TMP26]], align 8 +// CHECK11-NEXT: [[TMP28:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK11-NEXT: [[TMP29:%.*]] = load i32, i32* [[TMP28]], align 4 +// CHECK11-NEXT: [[TMP30:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK11-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP29]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i64, i64, i64, [1000 x i16]*)* @__omp_outlined__3 to i8*), i8* null, i8** [[TMP30]], i64 4) +// CHECK11-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK11: omp.inner.for.inc: +// CHECK11-NEXT: [[TMP31:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK11-NEXT: [[TMP32:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK11-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP31]], [[TMP32]] +// CHECK11-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_IV]], align 4 +// CHECK11-NEXT: [[TMP33:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK11-NEXT: [[TMP34:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK11-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP33]], [[TMP34]] +// CHECK11-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK11-NEXT: [[TMP35:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK11-NEXT: [[TMP36:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK11-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP35]], [[TMP36]] +// CHECK11-NEXT: store i32 [[ADD9]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK11-NEXT: [[TMP37:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK11-NEXT: [[TMP38:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK11-NEXT: [[CMP10:%.*]] = icmp sgt i32 [[TMP37]], [[TMP38]] +// CHECK11-NEXT: br i1 [[CMP10]], label [[COND_TRUE11:%.*]], label [[COND_FALSE12:%.*]] +// CHECK11: cond.true11: +// CHECK11-NEXT: [[TMP39:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK11-NEXT: br label [[COND_END13:%.*]] +// CHECK11: cond.false12: +// CHECK11-NEXT: [[TMP40:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK11-NEXT: br label [[COND_END13]] +// CHECK11: cond.end13: +// CHECK11-NEXT: [[COND14:%.*]] = phi i32 [ [[TMP39]], [[COND_TRUE11]] ], [ [[TMP40]], [[COND_FALSE12]] ] +// CHECK11-NEXT: store i32 [[COND14]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK11-NEXT: [[TMP41:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK11-NEXT: store i32 [[TMP41]], i32* [[DOTOMP_IV]], align 4 +// CHECK11-NEXT: br label [[OMP_INNER_FOR_COND]] +// 
CHECK11: omp.inner.for.end: +// CHECK11-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK11: omp.loop.exit: +// CHECK11-NEXT: [[TMP42:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK11-NEXT: [[TMP43:%.*]] = load i32, i32* [[TMP42]], align 4 +// CHECK11-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP43]]) +// CHECK11-NEXT: br label [[OMP_PRECOND_END]] +// CHECK11: omp.precond.end: +// CHECK11-NEXT: ret void +// +// +// CHECK11-LABEL: define {{[^@]+}}@__omp_outlined__3 +// CHECK11-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], i64 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { +// CHECK11-NEXT: entry: +// CHECK11-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK11-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK11-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8 +// CHECK11-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8 +// CHECK11-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 +// CHECK11-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 8 +// CHECK11-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[I5:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK11-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +// CHECK11-NEXT: store i64 [[DOTPREVIOUS_LB_]], i64* [[DOTPREVIOUS_LB__ADDR]], align 8 +// CHECK11-NEXT: store i64 [[DOTPREVIOUS_UB_]], i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK11-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 +// CHECK11-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 8 +// CHECK11-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* +// CHECK11-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 8 +// CHECK11-NEXT: [[TMP1:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK11-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK11-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK11-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 +// CHECK11-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK11-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK11-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK11-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK11-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK11-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] +// CHECK11-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK11: omp.precond.then: +// CHECK11-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 +// CHECK11-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK11-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_UB]], align 4 +// CHECK11-NEXT: [[TMP5:%.*]] = load i64, i64* [[DOTPREVIOUS_LB__ADDR]], align 8 +// CHECK11-NEXT: [[CONV3:%.*]] = trunc i64 [[TMP5]] to i32 +// CHECK11-NEXT: [[TMP6:%.*]] 
= load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK11-NEXT: [[CONV4:%.*]] = trunc i64 [[TMP6]] to i32 +// CHECK11-NEXT: store i32 [[CONV3]], i32* [[DOTOMP_LB]], align 4 +// CHECK11-NEXT: store i32 [[CONV4]], i32* [[DOTOMP_UB]], align 4 +// CHECK11-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK11-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK11-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK11-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4 +// CHECK11-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP8]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) +// CHECK11-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK11-NEXT: store i32 [[TMP9]], i32* [[DOTOMP_IV]], align 4 +// CHECK11-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK11: omp.inner.for.cond: +// CHECK11-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK11-NEXT: [[CONV6:%.*]] = sext i32 [[TMP10]] to i64 +// CHECK11-NEXT: [[TMP11:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK11-NEXT: [[CMP7:%.*]] = icmp ule i64 [[CONV6]], [[TMP11]] +// CHECK11-NEXT: br i1 [[CMP7]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK11: omp.inner.for.body: +// CHECK11-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK11-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP12]], 1 +// CHECK11-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK11-NEXT: store i32 [[ADD]], i32* [[I5]], align 4 +// CHECK11-NEXT: [[TMP13:%.*]] = load i32, i32* [[I5]], align 4 +// CHECK11-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP13]] to i64 +// CHECK11-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1000 x i16], [1000 x i16]* [[TMP0]], i64 0, i64 [[IDXPROM]] +// CHECK11-NEXT: [[TMP14:%.*]] = load i16, i16* [[ARRAYIDX]], align 2 +// CHECK11-NEXT: [[CONV8:%.*]] = sext i16 [[TMP14]] to i32 +// CHECK11-NEXT: [[ADD9:%.*]] = add nsw i32 [[CONV8]], 1 +// CHECK11-NEXT: [[CONV10:%.*]] = trunc i32 [[ADD9]] to i16 +// CHECK11-NEXT: store i16 [[CONV10]], i16* [[ARRAYIDX]], align 2 +// CHECK11-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK11: omp.body.continue: +// CHECK11-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK11: omp.inner.for.inc: +// CHECK11-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK11-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK11-NEXT: [[ADD11:%.*]] = add nsw i32 [[TMP15]], [[TMP16]] +// CHECK11-NEXT: store i32 [[ADD11]], i32* [[DOTOMP_IV]], align 4 +// CHECK11-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK11: omp.inner.for.end: +// CHECK11-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK11: omp.loop.exit: +// CHECK11-NEXT: [[TMP17:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK11-NEXT: [[TMP18:%.*]] = load i32, i32* [[TMP17]], align 4 +// CHECK11-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP18]]) +// CHECK11-NEXT: br label [[OMP_PRECOND_END]] +// CHECK11: omp.precond.end: +// CHECK11-NEXT: ret void +// +// +// CHECK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l54 +// CHECK11-SAME: ([10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { +// CHECK11-NEXT: entry: +// CHECK11-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 8 +// CHECK11-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 
+// CHECK11-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK11-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 8 +// CHECK11-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 8 +// CHECK11-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK11-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) +// CHECK11-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK11: .execute: +// CHECK11-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) +// CHECK11-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK11-NEXT: call void @__omp_outlined__4(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x i32]* [[TMP0]]) #[[ATTR2]] +// CHECK11-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK11: .omp.deinit: +// CHECK11-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) +// CHECK11-NEXT: br label [[DOTEXIT:%.*]] +// CHECK11: .exit: +// CHECK11-NEXT: ret void +// +// +// CHECK11-LABEL: define {{[^@]+}}@__omp_outlined__4 +// CHECK11-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { +// CHECK11-NEXT: entry: +// CHECK11-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK11-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK11-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 8 +// CHECK11-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [3 x i8*], align 8 +// CHECK11-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK11-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +// CHECK11-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 8 +// CHECK11-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 8 +// CHECK11-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK11-NEXT: store i32 9, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK11-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK11-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK11-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK11-NEXT: [[TMP1:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK11-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4 +// CHECK11-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) +// CHECK11-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK11-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP3]], 9 +// CHECK11-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK11: cond.true: +// CHECK11-NEXT: br label [[COND_END:%.*]] +// CHECK11: cond.false: +// CHECK11-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK11-NEXT: br label [[COND_END]] +// CHECK11: cond.end: +// CHECK11-NEXT: [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP4]], [[COND_FALSE]] ] +// 
CHECK11-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK11-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK11-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 +// CHECK11-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK11: omp.inner.for.cond: +// CHECK11-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK11-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP6]], 10 +// CHECK11-NEXT: br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK11: omp.inner.for.body: +// CHECK11-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK11-NEXT: [[TMP8:%.*]] = zext i32 [[TMP7]] to i64 +// CHECK11-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK11-NEXT: [[TMP10:%.*]] = zext i32 [[TMP9]] to i64 +// CHECK11-NEXT: [[TMP11:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0 +// CHECK11-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP8]] to i8* +// CHECK11-NEXT: store i8* [[TMP12]], i8** [[TMP11]], align 8 +// CHECK11-NEXT: [[TMP13:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1 +// CHECK11-NEXT: [[TMP14:%.*]] = inttoptr i64 [[TMP10]] to i8* +// CHECK11-NEXT: store i8* [[TMP14]], i8** [[TMP13]], align 8 +// CHECK11-NEXT: [[TMP15:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 2 +// CHECK11-NEXT: [[TMP16:%.*]] = bitcast [10 x i32]* [[TMP0]] to i8* +// CHECK11-NEXT: store i8* [[TMP16]], i8** [[TMP15]], align 8 +// CHECK11-NEXT: [[TMP17:%.*]] = bitcast [3 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK11-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i64, i64, [10 x i32]*)* @__omp_outlined__5 to i8*), i8* null, i8** [[TMP17]], i64 3) +// CHECK11-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK11: omp.inner.for.inc: +// CHECK11-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK11-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK11-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP18]], [[TMP19]] +// CHECK11-NEXT: store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4 +// CHECK11-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK11-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK11-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP20]], [[TMP21]] +// CHECK11-NEXT: store i32 [[ADD2]], i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK11-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK11-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK11-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP22]], [[TMP23]] +// CHECK11-NEXT: store i32 [[ADD3]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK11-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK11-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP24]], 9 +// CHECK11-NEXT: br i1 [[CMP4]], label [[COND_TRUE5:%.*]], label [[COND_FALSE6:%.*]] +// CHECK11: cond.true5: +// CHECK11-NEXT: br label [[COND_END7:%.*]] +// CHECK11: cond.false6: +// CHECK11-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK11-NEXT: br label [[COND_END7]] +// CHECK11: cond.end7: +// CHECK11-NEXT: [[COND8:%.*]] = phi i32 [ 9, [[COND_TRUE5]] ], [ [[TMP25]], [[COND_FALSE6]] ] +// CHECK11-NEXT: store i32 [[COND8]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK11-NEXT: [[TMP26:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK11-NEXT: store 
i32 [[TMP26]], i32* [[DOTOMP_IV]], align 4 +// CHECK11-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK11: omp.inner.for.end: +// CHECK11-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK11: omp.loop.exit: +// CHECK11-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]]) +// CHECK11-NEXT: ret void +// +// +// CHECK11-LABEL: define {{[^@]+}}@__omp_outlined__5 +// CHECK11-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { +// CHECK11-NEXT: entry: +// CHECK11-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK11-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK11-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8 +// CHECK11-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8 +// CHECK11-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 8 +// CHECK11-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK11-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +// CHECK11-NEXT: store i64 [[DOTPREVIOUS_LB_]], i64* [[DOTPREVIOUS_LB__ADDR]], align 8 +// CHECK11-NEXT: store i64 [[DOTPREVIOUS_UB_]], i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK11-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 8 +// CHECK11-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 8 +// CHECK11-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 +// CHECK11-NEXT: store i32 9, i32* [[DOTOMP_UB]], align 4 +// CHECK11-NEXT: [[TMP1:%.*]] = load i64, i64* [[DOTPREVIOUS_LB__ADDR]], align 8 +// CHECK11-NEXT: [[CONV:%.*]] = trunc i64 [[TMP1]] to i32 +// CHECK11-NEXT: [[TMP2:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK11-NEXT: [[CONV1:%.*]] = trunc i64 [[TMP2]] to i32 +// CHECK11-NEXT: store i32 [[CONV]], i32* [[DOTOMP_LB]], align 4 +// CHECK11-NEXT: store i32 [[CONV1]], i32* [[DOTOMP_UB]], align 4 +// CHECK11-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK11-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK11-NEXT: [[TMP3:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK11-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4 +// CHECK11-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP4]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) +// CHECK11-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK11-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 +// CHECK11-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK11: omp.inner.for.cond: +// CHECK11-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK11-NEXT: [[CONV2:%.*]] = sext i32 [[TMP6]] to i64 +// CHECK11-NEXT: [[TMP7:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK11-NEXT: [[CMP:%.*]] = icmp ule i64 [[CONV2]], [[TMP7]] +// CHECK11-NEXT: br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK11: omp.inner.for.body: +// CHECK11-NEXT: 
[[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK11-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP8]], 1 +// CHECK11-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK11-NEXT: store i32 [[ADD]], i32* [[I]], align 4 +// CHECK11-NEXT: [[TMP9:%.*]] = load i32, i32* [[I]], align 4 +// CHECK11-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP9]] to i64 +// CHECK11-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP0]], i64 0, i64 [[IDXPROM]] +// CHECK11-NEXT: [[TMP10:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +// CHECK11-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP10]], 1 +// CHECK11-NEXT: store i32 [[ADD3]], i32* [[ARRAYIDX]], align 4 +// CHECK11-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK11: omp.body.continue: +// CHECK11-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK11: omp.inner.for.inc: +// CHECK11-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK11-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK11-NEXT: [[ADD4:%.*]] = add nsw i32 [[TMP11]], [[TMP12]] +// CHECK11-NEXT: store i32 [[ADD4]], i32* [[DOTOMP_IV]], align 4 +// CHECK11-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK11: omp.inner.for.end: +// CHECK11-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK11: omp.loop.exit: +// CHECK11-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP4]]) +// CHECK11-NEXT: ret void +// +// +// CHECK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l59 +// CHECK11-SAME: ([10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i64 [[F:%.*]]) #[[ATTR0]] { +// CHECK11-NEXT: entry: +// CHECK11-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 8 +// CHECK11-NEXT: [[F_ADDR:%.*]] = alloca i64, align 8 +// CHECK11-NEXT: [[F_CASTED:%.*]] = alloca i64, align 8 +// CHECK11-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK11-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 8 +// CHECK11-NEXT: store i64 [[F]], i64* [[F_ADDR]], align 8 +// CHECK11-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 8 +// CHECK11-NEXT: [[CONV:%.*]] = bitcast i64* [[F_ADDR]] to i32* +// CHECK11-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK11-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) +// CHECK11-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK11: .execute: +// CHECK11-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) +// CHECK11-NEXT: [[TMP2:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK11-NEXT: [[CONV1:%.*]] = bitcast i64* [[F_CASTED]] to i32* +// CHECK11-NEXT: store i32 [[TMP2]], i32* [[CONV1]], align 4 +// CHECK11-NEXT: [[TMP3:%.*]] = load i64, i64* [[F_CASTED]], align 8 +// CHECK11-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK11-NEXT: call void @__omp_outlined__6(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x [10 x i32]]* [[TMP0]], i64 [[TMP3]]) #[[ATTR2]] +// CHECK11-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK11: .omp.deinit: +// CHECK11-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) +// CHECK11-NEXT: br label [[DOTEXIT:%.*]] +// CHECK11: .exit: +// CHECK11-NEXT: ret void +// +// +// CHECK11-LABEL: define {{[^@]+}}@__omp_outlined__6 +// CHECK11-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x [10 x i32]]* 
nonnull align 4 dereferenceable(400) [[C:%.*]], i64 [[F:%.*]]) #[[ATTR0]] { +// CHECK11-NEXT: entry: +// CHECK11-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK11-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK11-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 8 +// CHECK11-NEXT: [[F_ADDR:%.*]] = alloca i64, align 8 +// CHECK11-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[_TMP1:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[K:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[J:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[F_CASTED:%.*]] = alloca i64, align 8 +// CHECK11-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x i8*], align 8 +// CHECK11-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK11-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +// CHECK11-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 8 +// CHECK11-NEXT: store i64 [[F]], i64* [[F_ADDR]], align 8 +// CHECK11-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 8 +// CHECK11-NEXT: [[CONV:%.*]] = bitcast i64* [[F_ADDR]] to i32* +// CHECK11-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK11-NEXT: store i32 99, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK11-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK11-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK11-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK11-NEXT: [[TMP1:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK11-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4 +// CHECK11-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) +// CHECK11-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK11-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP3]], 99 +// CHECK11-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK11: cond.true: +// CHECK11-NEXT: br label [[COND_END:%.*]] +// CHECK11: cond.false: +// CHECK11-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK11-NEXT: br label [[COND_END]] +// CHECK11: cond.end: +// CHECK11-NEXT: [[COND:%.*]] = phi i32 [ 99, [[COND_TRUE]] ], [ [[TMP4]], [[COND_FALSE]] ] +// CHECK11-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK11-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK11-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 +// CHECK11-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK11: omp.inner.for.cond: +// CHECK11-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK11-NEXT: [[CMP2:%.*]] = icmp slt i32 [[TMP6]], 100 +// CHECK11-NEXT: br i1 [[CMP2]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK11: omp.inner.for.body: +// CHECK11-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK11-NEXT: [[TMP8:%.*]] = zext i32 [[TMP7]] to i64 +// CHECK11-NEXT: [[TMP9:%.*]] = 
load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK11-NEXT: [[TMP10:%.*]] = zext i32 [[TMP9]] to i64 +// CHECK11-NEXT: [[TMP11:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK11-NEXT: [[CONV3:%.*]] = bitcast i64* [[F_CASTED]] to i32* +// CHECK11-NEXT: store i32 [[TMP11]], i32* [[CONV3]], align 4 +// CHECK11-NEXT: [[TMP12:%.*]] = load i64, i64* [[F_CASTED]], align 8 +// CHECK11-NEXT: [[TMP13:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0 +// CHECK11-NEXT: [[TMP14:%.*]] = inttoptr i64 [[TMP8]] to i8* +// CHECK11-NEXT: store i8* [[TMP14]], i8** [[TMP13]], align 8 +// CHECK11-NEXT: [[TMP15:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1 +// CHECK11-NEXT: [[TMP16:%.*]] = inttoptr i64 [[TMP10]] to i8* +// CHECK11-NEXT: store i8* [[TMP16]], i8** [[TMP15]], align 8 +// CHECK11-NEXT: [[TMP17:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 2 +// CHECK11-NEXT: [[TMP18:%.*]] = bitcast [10 x [10 x i32]]* [[TMP0]] to i8* +// CHECK11-NEXT: store i8* [[TMP18]], i8** [[TMP17]], align 8 +// CHECK11-NEXT: [[TMP19:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 3 +// CHECK11-NEXT: [[TMP20:%.*]] = inttoptr i64 [[TMP12]] to i8* +// CHECK11-NEXT: store i8* [[TMP20]], i8** [[TMP19]], align 8 +// CHECK11-NEXT: [[TMP21:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK11-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i64, i64, [10 x [10 x i32]]*, i64)* @__omp_outlined__7 to i8*), i8* null, i8** [[TMP21]], i64 4) +// CHECK11-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK11: omp.inner.for.inc: +// CHECK11-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK11-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK11-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP22]], [[TMP23]] +// CHECK11-NEXT: store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4 +// CHECK11-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK11-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK11-NEXT: [[ADD4:%.*]] = add nsw i32 [[TMP24]], [[TMP25]] +// CHECK11-NEXT: store i32 [[ADD4]], i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK11-NEXT: [[TMP26:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK11-NEXT: [[TMP27:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK11-NEXT: [[ADD5:%.*]] = add nsw i32 [[TMP26]], [[TMP27]] +// CHECK11-NEXT: store i32 [[ADD5]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK11-NEXT: [[TMP28:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK11-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[TMP28]], 99 +// CHECK11-NEXT: br i1 [[CMP6]], label [[COND_TRUE7:%.*]], label [[COND_FALSE8:%.*]] +// CHECK11: cond.true7: +// CHECK11-NEXT: br label [[COND_END9:%.*]] +// CHECK11: cond.false8: +// CHECK11-NEXT: [[TMP29:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK11-NEXT: br label [[COND_END9]] +// CHECK11: cond.end9: +// CHECK11-NEXT: [[COND10:%.*]] = phi i32 [ 99, [[COND_TRUE7]] ], [ [[TMP29]], [[COND_FALSE8]] ] +// CHECK11-NEXT: store i32 [[COND10]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK11-NEXT: [[TMP30:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK11-NEXT: store i32 [[TMP30]], i32* [[DOTOMP_IV]], align 4 +// CHECK11-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK11: omp.inner.for.end: +// CHECK11-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK11: 
omp.loop.exit: +// CHECK11-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]]) +// CHECK11-NEXT: ret void +// +// +// CHECK11-LABEL: define {{[^@]+}}@__omp_outlined__7 +// CHECK11-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i64 [[F:%.*]]) #[[ATTR0]] { +// CHECK11-NEXT: entry: +// CHECK11-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK11-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK11-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8 +// CHECK11-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8 +// CHECK11-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 8 +// CHECK11-NEXT: [[F_ADDR:%.*]] = alloca i64, align 8 +// CHECK11-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[_TMP1:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[K:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[J:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK11-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +// CHECK11-NEXT: store i64 [[DOTPREVIOUS_LB_]], i64* [[DOTPREVIOUS_LB__ADDR]], align 8 +// CHECK11-NEXT: store i64 [[DOTPREVIOUS_UB_]], i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK11-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 8 +// CHECK11-NEXT: store i64 [[F]], i64* [[F_ADDR]], align 8 +// CHECK11-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 8 +// CHECK11-NEXT: [[CONV:%.*]] = bitcast i64* [[F_ADDR]] to i32* +// CHECK11-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 +// CHECK11-NEXT: store i32 99, i32* [[DOTOMP_UB]], align 4 +// CHECK11-NEXT: [[TMP1:%.*]] = load i64, i64* [[DOTPREVIOUS_LB__ADDR]], align 8 +// CHECK11-NEXT: [[CONV2:%.*]] = trunc i64 [[TMP1]] to i32 +// CHECK11-NEXT: [[TMP2:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK11-NEXT: [[CONV3:%.*]] = trunc i64 [[TMP2]] to i32 +// CHECK11-NEXT: store i32 [[CONV2]], i32* [[DOTOMP_LB]], align 4 +// CHECK11-NEXT: store i32 [[CONV3]], i32* [[DOTOMP_UB]], align 4 +// CHECK11-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK11-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK11-NEXT: [[TMP3:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK11-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4 +// CHECK11-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP4]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) +// CHECK11-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK11-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 +// CHECK11-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK11: omp.inner.for.cond: +// CHECK11-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK11-NEXT: [[CONV4:%.*]] = sext i32 [[TMP6]] to i64 +// CHECK11-NEXT: [[TMP7:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// 
CHECK11-NEXT: [[CMP:%.*]] = icmp ule i64 [[CONV4]], [[TMP7]] +// CHECK11-NEXT: br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK11: omp.inner.for.body: +// CHECK11-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK11-NEXT: [[DIV:%.*]] = sdiv i32 [[TMP8]], 10 +// CHECK11-NEXT: [[MUL:%.*]] = mul nsw i32 [[DIV]], 1 +// CHECK11-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK11-NEXT: store i32 [[ADD]], i32* [[I]], align 4 +// CHECK11-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK11-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK11-NEXT: [[DIV5:%.*]] = sdiv i32 [[TMP10]], 10 +// CHECK11-NEXT: [[MUL6:%.*]] = mul nsw i32 [[DIV5]], 10 +// CHECK11-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP9]], [[MUL6]] +// CHECK11-NEXT: [[MUL7:%.*]] = mul nsw i32 [[SUB]], 1 +// CHECK11-NEXT: [[ADD8:%.*]] = add nsw i32 0, [[MUL7]] +// CHECK11-NEXT: store i32 [[ADD8]], i32* [[J]], align 4 +// CHECK11-NEXT: store i32 10, i32* [[K]], align 4 +// CHECK11-NEXT: [[TMP11:%.*]] = load i32, i32* [[I]], align 4 +// CHECK11-NEXT: [[TMP12:%.*]] = load i32, i32* [[J]], align 4 +// CHECK11-NEXT: [[TMP13:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK11-NEXT: [[MUL9:%.*]] = mul nsw i32 [[TMP12]], [[TMP13]] +// CHECK11-NEXT: [[ADD10:%.*]] = add nsw i32 [[TMP11]], [[MUL9]] +// CHECK11-NEXT: [[TMP14:%.*]] = load i32, i32* [[K]], align 4 +// CHECK11-NEXT: [[ADD11:%.*]] = add nsw i32 [[ADD10]], [[TMP14]] +// CHECK11-NEXT: [[TMP15:%.*]] = load i32, i32* [[I]], align 4 +// CHECK11-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP15]] to i64 +// CHECK11-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* [[TMP0]], i64 0, i64 [[IDXPROM]] +// CHECK11-NEXT: [[TMP16:%.*]] = load i32, i32* [[J]], align 4 +// CHECK11-NEXT: [[IDXPROM12:%.*]] = sext i32 [[TMP16]] to i64 +// CHECK11-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[ARRAYIDX]], i64 0, i64 [[IDXPROM12]] +// CHECK11-NEXT: store i32 [[ADD11]], i32* [[ARRAYIDX13]], align 4 +// CHECK11-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK11: omp.body.continue: +// CHECK11-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK11: omp.inner.for.inc: +// CHECK11-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK11-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK11-NEXT: [[ADD14:%.*]] = add nsw i32 [[TMP17]], [[TMP18]] +// CHECK11-NEXT: store i32 [[ADD14]], i32* [[DOTOMP_IV]], align 4 +// CHECK11-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK11: omp.inner.for.end: +// CHECK11-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK11: omp.loop.exit: +// CHECK11-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP4]]) +// CHECK11-NEXT: ret void +// +// +// CHECK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l67 +// CHECK11-SAME: (i64 [[N:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]]) #[[ATTR0]] { +// CHECK11-NEXT: entry: +// CHECK11-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 +// CHECK11-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 8 +// CHECK11-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8 +// CHECK11-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK11-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 +// CHECK11-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** 
[[C_ADDR]], align 8 +// CHECK11-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* +// CHECK11-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 8 +// CHECK11-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK11-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) +// CHECK11-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK11: .execute: +// CHECK11-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) +// CHECK11-NEXT: [[TMP2:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK11-NEXT: [[CONV1:%.*]] = bitcast i64* [[N_CASTED]] to i32* +// CHECK11-NEXT: store i32 [[TMP2]], i32* [[CONV1]], align 4 +// CHECK11-NEXT: [[TMP3:%.*]] = load i64, i64* [[N_CASTED]], align 8 +// CHECK11-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK11-NEXT: call void @__omp_outlined__8(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i64 [[TMP3]], [10 x [10 x i32]]* [[TMP0]]) #[[ATTR2]] +// CHECK11-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK11: .omp.deinit: +// CHECK11-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) +// CHECK11-NEXT: br label [[DOTEXIT:%.*]] +// CHECK11: .exit: +// CHECK11-NEXT: ret void +// +// +// CHECK11-LABEL: define {{[^@]+}}@__omp_outlined__8 +// CHECK11-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[N:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]]) #[[ATTR0]] { +// CHECK11-NEXT: entry: +// CHECK11-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK11-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK11-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 +// CHECK11-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 8 +// CHECK11-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[_TMP1:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTCAPTURE_EXPR_3:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[J:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[I8:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[J9:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8 +// CHECK11-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x i8*], align 8 +// CHECK11-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK11-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +// CHECK11-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 +// CHECK11-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 8 +// CHECK11-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* +// CHECK11-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 8 +// CHECK11-NEXT: [[TMP1:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK11-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK11-NEXT: [[TMP2:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK11-NEXT: store i32 [[TMP2]], i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK11-NEXT: [[TMP3:%.*]] = load i32, i32* 
[[DOTCAPTURE_EXPR_]], align 4 +// CHECK11-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP3]], 0 +// CHECK11-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK11-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK11-NEXT: [[SUB4:%.*]] = sub nsw i32 [[TMP4]], 0 +// CHECK11-NEXT: [[DIV5:%.*]] = sdiv i32 [[SUB4]], 1 +// CHECK11-NEXT: [[MUL:%.*]] = mul nsw i32 [[DIV]], [[DIV5]] +// CHECK11-NEXT: [[SUB6:%.*]] = sub nsw i32 [[MUL]], 1 +// CHECK11-NEXT: store i32 [[SUB6]], i32* [[DOTCAPTURE_EXPR_3]], align 4 +// CHECK11-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK11-NEXT: store i32 0, i32* [[J]], align 4 +// CHECK11-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK11-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP5]] +// CHECK11-NEXT: br i1 [[CMP]], label [[LAND_LHS_TRUE:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK11: land.lhs.true: +// CHECK11-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK11-NEXT: [[CMP7:%.*]] = icmp slt i32 0, [[TMP6]] +// CHECK11-NEXT: br i1 [[CMP7]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END]] +// CHECK11: omp.precond.then: +// CHECK11-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK11-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4 +// CHECK11-NEXT: store i32 [[TMP7]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK11-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK11-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK11-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK11-NEXT: [[TMP8:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK11-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP8]], align 4 +// CHECK11-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP9]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) +// CHECK11-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK11-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4 +// CHECK11-NEXT: [[CMP10:%.*]] = icmp sgt i32 [[TMP10]], [[TMP11]] +// CHECK11-NEXT: br i1 [[CMP10]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK11: cond.true: +// CHECK11-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4 +// CHECK11-NEXT: br label [[COND_END:%.*]] +// CHECK11: cond.false: +// CHECK11-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK11-NEXT: br label [[COND_END]] +// CHECK11: cond.end: +// CHECK11-NEXT: [[COND:%.*]] = phi i32 [ [[TMP12]], [[COND_TRUE]] ], [ [[TMP13]], [[COND_FALSE]] ] +// CHECK11-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK11-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK11-NEXT: store i32 [[TMP14]], i32* [[DOTOMP_IV]], align 4 +// CHECK11-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK11: omp.inner.for.cond: +// CHECK11-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK11-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4 +// CHECK11-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP16]], 1 +// CHECK11-NEXT: [[CMP11:%.*]] = icmp slt i32 [[TMP15]], [[ADD]] +// CHECK11-NEXT: br i1 [[CMP11]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK11: omp.inner.for.body: +// CHECK11-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK11-NEXT: [[TMP18:%.*]] = zext i32 [[TMP17]] to i64 +// CHECK11-NEXT: 
[[TMP19:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK11-NEXT: [[TMP20:%.*]] = zext i32 [[TMP19]] to i64 +// CHECK11-NEXT: [[TMP21:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK11-NEXT: [[CONV12:%.*]] = bitcast i64* [[N_CASTED]] to i32* +// CHECK11-NEXT: store i32 [[TMP21]], i32* [[CONV12]], align 4 +// CHECK11-NEXT: [[TMP22:%.*]] = load i64, i64* [[N_CASTED]], align 8 +// CHECK11-NEXT: [[TMP23:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0 +// CHECK11-NEXT: [[TMP24:%.*]] = inttoptr i64 [[TMP18]] to i8* +// CHECK11-NEXT: store i8* [[TMP24]], i8** [[TMP23]], align 8 +// CHECK11-NEXT: [[TMP25:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1 +// CHECK11-NEXT: [[TMP26:%.*]] = inttoptr i64 [[TMP20]] to i8* +// CHECK11-NEXT: store i8* [[TMP26]], i8** [[TMP25]], align 8 +// CHECK11-NEXT: [[TMP27:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 2 +// CHECK11-NEXT: [[TMP28:%.*]] = inttoptr i64 [[TMP22]] to i8* +// CHECK11-NEXT: store i8* [[TMP28]], i8** [[TMP27]], align 8 +// CHECK11-NEXT: [[TMP29:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 3 +// CHECK11-NEXT: [[TMP30:%.*]] = bitcast [10 x [10 x i32]]* [[TMP0]] to i8* +// CHECK11-NEXT: store i8* [[TMP30]], i8** [[TMP29]], align 8 +// CHECK11-NEXT: [[TMP31:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK11-NEXT: [[TMP32:%.*]] = load i32, i32* [[TMP31]], align 4 +// CHECK11-NEXT: [[TMP33:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK11-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP32]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i64, i64, i64, [10 x [10 x i32]]*)* @__omp_outlined__9 to i8*), i8* null, i8** [[TMP33]], i64 4) +// CHECK11-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK11: omp.inner.for.inc: +// CHECK11-NEXT: [[TMP34:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK11-NEXT: [[TMP35:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK11-NEXT: [[ADD13:%.*]] = add nsw i32 [[TMP34]], [[TMP35]] +// CHECK11-NEXT: store i32 [[ADD13]], i32* [[DOTOMP_IV]], align 4 +// CHECK11-NEXT: [[TMP36:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK11-NEXT: [[TMP37:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK11-NEXT: [[ADD14:%.*]] = add nsw i32 [[TMP36]], [[TMP37]] +// CHECK11-NEXT: store i32 [[ADD14]], i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK11-NEXT: [[TMP38:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK11-NEXT: [[TMP39:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK11-NEXT: [[ADD15:%.*]] = add nsw i32 [[TMP38]], [[TMP39]] +// CHECK11-NEXT: store i32 [[ADD15]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK11-NEXT: [[TMP40:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK11-NEXT: [[TMP41:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4 +// CHECK11-NEXT: [[CMP16:%.*]] = icmp sgt i32 [[TMP40]], [[TMP41]] +// CHECK11-NEXT: br i1 [[CMP16]], label [[COND_TRUE17:%.*]], label [[COND_FALSE18:%.*]] +// CHECK11: cond.true17: +// CHECK11-NEXT: [[TMP42:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4 +// CHECK11-NEXT: br label [[COND_END19:%.*]] +// CHECK11: cond.false18: +// CHECK11-NEXT: [[TMP43:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK11-NEXT: br label [[COND_END19]] +// CHECK11: cond.end19: +// CHECK11-NEXT: [[COND20:%.*]] = phi i32 [ [[TMP42]], [[COND_TRUE17]] ], [ [[TMP43]], [[COND_FALSE18]] ] 
+// CHECK11-NEXT: store i32 [[COND20]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK11-NEXT: [[TMP44:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK11-NEXT: store i32 [[TMP44]], i32* [[DOTOMP_IV]], align 4 +// CHECK11-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK11: omp.inner.for.end: +// CHECK11-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK11: omp.loop.exit: +// CHECK11-NEXT: [[TMP45:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK11-NEXT: [[TMP46:%.*]] = load i32, i32* [[TMP45]], align 4 +// CHECK11-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP46]]) +// CHECK11-NEXT: br label [[OMP_PRECOND_END]] +// CHECK11: omp.precond.end: +// CHECK11-NEXT: ret void +// +// +// CHECK11-LABEL: define {{[^@]+}}@__omp_outlined__9 +// CHECK11-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], i64 [[N:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]]) #[[ATTR0]] { +// CHECK11-NEXT: entry: +// CHECK11-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK11-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK11-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8 +// CHECK11-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8 +// CHECK11-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 +// CHECK11-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 8 +// CHECK11-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[_TMP1:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTCAPTURE_EXPR_3:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[J:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[I10:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[J11:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK11-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +// CHECK11-NEXT: store i64 [[DOTPREVIOUS_LB_]], i64* [[DOTPREVIOUS_LB__ADDR]], align 8 +// CHECK11-NEXT: store i64 [[DOTPREVIOUS_UB_]], i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK11-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 +// CHECK11-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 8 +// CHECK11-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* +// CHECK11-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 8 +// CHECK11-NEXT: [[TMP1:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK11-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK11-NEXT: [[TMP2:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK11-NEXT: store i32 [[TMP2]], i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK11-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK11-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP3]], 0 +// CHECK11-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK11-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK11-NEXT: [[SUB4:%.*]] = sub nsw i32 [[TMP4]], 0 +// CHECK11-NEXT: 
[[DIV5:%.*]] = sdiv i32 [[SUB4]], 1 +// CHECK11-NEXT: [[MUL:%.*]] = mul nsw i32 [[DIV]], [[DIV5]] +// CHECK11-NEXT: [[SUB6:%.*]] = sub nsw i32 [[MUL]], 1 +// CHECK11-NEXT: store i32 [[SUB6]], i32* [[DOTCAPTURE_EXPR_3]], align 4 +// CHECK11-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK11-NEXT: store i32 0, i32* [[J]], align 4 +// CHECK11-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK11-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP5]] +// CHECK11-NEXT: br i1 [[CMP]], label [[LAND_LHS_TRUE:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK11: land.lhs.true: +// CHECK11-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK11-NEXT: [[CMP7:%.*]] = icmp slt i32 0, [[TMP6]] +// CHECK11-NEXT: br i1 [[CMP7]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END]] +// CHECK11: omp.precond.then: +// CHECK11-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 +// CHECK11-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4 +// CHECK11-NEXT: store i32 [[TMP7]], i32* [[DOTOMP_UB]], align 4 +// CHECK11-NEXT: [[TMP8:%.*]] = load i64, i64* [[DOTPREVIOUS_LB__ADDR]], align 8 +// CHECK11-NEXT: [[CONV8:%.*]] = trunc i64 [[TMP8]] to i32 +// CHECK11-NEXT: [[TMP9:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK11-NEXT: [[CONV9:%.*]] = trunc i64 [[TMP9]] to i32 +// CHECK11-NEXT: store i32 [[CONV8]], i32* [[DOTOMP_LB]], align 4 +// CHECK11-NEXT: store i32 [[CONV9]], i32* [[DOTOMP_UB]], align 4 +// CHECK11-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK11-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK11-NEXT: [[TMP10:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK11-NEXT: [[TMP11:%.*]] = load i32, i32* [[TMP10]], align 4 +// CHECK11-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP11]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) +// CHECK11-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK11-NEXT: store i32 [[TMP12]], i32* [[DOTOMP_IV]], align 4 +// CHECK11-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK11: omp.inner.for.cond: +// CHECK11-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK11-NEXT: [[CONV12:%.*]] = sext i32 [[TMP13]] to i64 +// CHECK11-NEXT: [[TMP14:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK11-NEXT: [[CMP13:%.*]] = icmp ule i64 [[CONV12]], [[TMP14]] +// CHECK11-NEXT: br i1 [[CMP13]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK11: omp.inner.for.body: +// CHECK11-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK11-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK11-NEXT: [[SUB14:%.*]] = sub nsw i32 [[TMP16]], 0 +// CHECK11-NEXT: [[DIV15:%.*]] = sdiv i32 [[SUB14]], 1 +// CHECK11-NEXT: [[MUL16:%.*]] = mul nsw i32 1, [[DIV15]] +// CHECK11-NEXT: [[DIV17:%.*]] = sdiv i32 [[TMP15]], [[MUL16]] +// CHECK11-NEXT: [[MUL18:%.*]] = mul nsw i32 [[DIV17]], 1 +// CHECK11-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL18]] +// CHECK11-NEXT: store i32 [[ADD]], i32* [[I10]], align 4 +// CHECK11-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK11-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK11-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK11-NEXT: [[SUB19:%.*]] = sub nsw i32 [[TMP19]], 0 +// CHECK11-NEXT: [[DIV20:%.*]] = sdiv i32 [[SUB19]], 1 +// CHECK11-NEXT: [[MUL21:%.*]] = mul nsw i32 1, 
[[DIV20]] +// CHECK11-NEXT: [[DIV22:%.*]] = sdiv i32 [[TMP18]], [[MUL21]] +// CHECK11-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK11-NEXT: [[SUB23:%.*]] = sub nsw i32 [[TMP20]], 0 +// CHECK11-NEXT: [[DIV24:%.*]] = sdiv i32 [[SUB23]], 1 +// CHECK11-NEXT: [[MUL25:%.*]] = mul nsw i32 1, [[DIV24]] +// CHECK11-NEXT: [[MUL26:%.*]] = mul nsw i32 [[DIV22]], [[MUL25]] +// CHECK11-NEXT: [[SUB27:%.*]] = sub nsw i32 [[TMP17]], [[MUL26]] +// CHECK11-NEXT: [[MUL28:%.*]] = mul nsw i32 [[SUB27]], 1 +// CHECK11-NEXT: [[ADD29:%.*]] = add nsw i32 0, [[MUL28]] +// CHECK11-NEXT: store i32 [[ADD29]], i32* [[J11]], align 4 +// CHECK11-NEXT: [[TMP21:%.*]] = load i32, i32* [[I10]], align 4 +// CHECK11-NEXT: [[TMP22:%.*]] = load i32, i32* [[J11]], align 4 +// CHECK11-NEXT: [[ADD30:%.*]] = add nsw i32 [[TMP21]], [[TMP22]] +// CHECK11-NEXT: [[TMP23:%.*]] = load i32, i32* [[I10]], align 4 +// CHECK11-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP23]] to i64 +// CHECK11-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* [[TMP0]], i64 0, i64 [[IDXPROM]] +// CHECK11-NEXT: [[TMP24:%.*]] = load i32, i32* [[J11]], align 4 +// CHECK11-NEXT: [[IDXPROM31:%.*]] = sext i32 [[TMP24]] to i64 +// CHECK11-NEXT: [[ARRAYIDX32:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[ARRAYIDX]], i64 0, i64 [[IDXPROM31]] +// CHECK11-NEXT: store i32 [[ADD30]], i32* [[ARRAYIDX32]], align 4 +// CHECK11-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK11: omp.body.continue: +// CHECK11-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK11: omp.inner.for.inc: +// CHECK11-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK11-NEXT: [[TMP26:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK11-NEXT: [[ADD33:%.*]] = add nsw i32 [[TMP25]], [[TMP26]] +// CHECK11-NEXT: store i32 [[ADD33]], i32* [[DOTOMP_IV]], align 4 +// CHECK11-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK11: omp.inner.for.end: +// CHECK11-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK11: omp.loop.exit: +// CHECK11-NEXT: [[TMP27:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK11-NEXT: [[TMP28:%.*]] = load i32, i32* [[TMP27]], align 4 +// CHECK11-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP28]]) +// CHECK11-NEXT: br label [[OMP_PRECOND_END]] +// CHECK11: omp.precond.end: +// CHECK11-NEXT: ret void +// +// +// CHECK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l74 +// CHECK11-SAME: (i64 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32* [[V:%.*]]) #[[ATTR0]] { +// CHECK11-NEXT: entry: +// CHECK11-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 +// CHECK11-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 8 +// CHECK11-NEXT: [[V_ADDR:%.*]] = alloca i32*, align 8 +// CHECK11-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8 +// CHECK11-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK11-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 +// CHECK11-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 8 +// CHECK11-NEXT: store i32* [[V]], i32** [[V_ADDR]], align 8 +// CHECK11-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* +// CHECK11-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 8 +// CHECK11-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK11-NEXT: call void 
@__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) +// CHECK11-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK11: .execute: +// CHECK11-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) +// CHECK11-NEXT: [[TMP2:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK11-NEXT: [[CONV1:%.*]] = bitcast i64* [[N_CASTED]] to i32* +// CHECK11-NEXT: store i32 [[TMP2]], i32* [[CONV1]], align 4 +// CHECK11-NEXT: [[TMP3:%.*]] = load i64, i64* [[N_CASTED]], align 8 +// CHECK11-NEXT: [[TMP4:%.*]] = load i32*, i32** [[V_ADDR]], align 8 +// CHECK11-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK11-NEXT: call void @__omp_outlined__10(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i64 [[TMP3]], [1000 x i32]* [[TMP0]], i32* [[TMP4]]) #[[ATTR2]] +// CHECK11-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK11: .omp.deinit: +// CHECK11-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) +// CHECK11-NEXT: br label [[DOTEXIT:%.*]] +// CHECK11: .exit: +// CHECK11-NEXT: ret void +// +// +// CHECK11-LABEL: define {{[^@]+}}@__omp_outlined__10 +// CHECK11-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32* [[V:%.*]]) #[[ATTR0]] { +// CHECK11-NEXT: entry: +// CHECK11-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK11-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK11-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 +// CHECK11-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 8 +// CHECK11-NEXT: [[V_ADDR:%.*]] = alloca i32*, align 8 +// CHECK11-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[I3:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8 +// CHECK11-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [5 x i8*], align 8 +// CHECK11-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK11-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +// CHECK11-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 +// CHECK11-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 8 +// CHECK11-NEXT: store i32* [[V]], i32** [[V_ADDR]], align 8 +// CHECK11-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* +// CHECK11-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 8 +// CHECK11-NEXT: [[TMP1:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK11-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK11-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK11-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 +// CHECK11-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK11-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK11-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK11-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK11-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK11-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] +// CHECK11-NEXT: br i1 [[CMP]], 
label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK11: omp.precond.then: +// CHECK11-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK11-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK11-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK11-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK11-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK11-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK11-NEXT: [[TMP5:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK11-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4 +// CHECK11-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP6]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) +// CHECK11-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK11-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK11-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP7]], [[TMP8]] +// CHECK11-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK11: cond.true: +// CHECK11-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK11-NEXT: br label [[COND_END:%.*]] +// CHECK11: cond.false: +// CHECK11-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK11-NEXT: br label [[COND_END]] +// CHECK11: cond.end: +// CHECK11-NEXT: [[COND:%.*]] = phi i32 [ [[TMP9]], [[COND_TRUE]] ], [ [[TMP10]], [[COND_FALSE]] ] +// CHECK11-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK11-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK11-NEXT: store i32 [[TMP11]], i32* [[DOTOMP_IV]], align 4 +// CHECK11-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK11: omp.inner.for.cond: +// CHECK11-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK11-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK11-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP13]], 1 +// CHECK11-NEXT: [[CMP5:%.*]] = icmp slt i32 [[TMP12]], [[ADD]] +// CHECK11-NEXT: br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK11: omp.inner.for.body: +// CHECK11-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK11-NEXT: [[TMP15:%.*]] = zext i32 [[TMP14]] to i64 +// CHECK11-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK11-NEXT: [[TMP17:%.*]] = zext i32 [[TMP16]] to i64 +// CHECK11-NEXT: [[TMP18:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK11-NEXT: [[CONV6:%.*]] = bitcast i64* [[N_CASTED]] to i32* +// CHECK11-NEXT: store i32 [[TMP18]], i32* [[CONV6]], align 4 +// CHECK11-NEXT: [[TMP19:%.*]] = load i64, i64* [[N_CASTED]], align 8 +// CHECK11-NEXT: [[TMP20:%.*]] = load i32*, i32** [[V_ADDR]], align 8 +// CHECK11-NEXT: [[TMP21:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0 +// CHECK11-NEXT: [[TMP22:%.*]] = inttoptr i64 [[TMP15]] to i8* +// CHECK11-NEXT: store i8* [[TMP22]], i8** [[TMP21]], align 8 +// CHECK11-NEXT: [[TMP23:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1 +// CHECK11-NEXT: [[TMP24:%.*]] = inttoptr i64 [[TMP17]] to i8* +// CHECK11-NEXT: store i8* [[TMP24]], i8** [[TMP23]], align 8 +// CHECK11-NEXT: [[TMP25:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 2 
+// CHECK11-NEXT: [[TMP26:%.*]] = inttoptr i64 [[TMP19]] to i8* +// CHECK11-NEXT: store i8* [[TMP26]], i8** [[TMP25]], align 8 +// CHECK11-NEXT: [[TMP27:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 3 +// CHECK11-NEXT: [[TMP28:%.*]] = bitcast [1000 x i32]* [[TMP0]] to i8* +// CHECK11-NEXT: store i8* [[TMP28]], i8** [[TMP27]], align 8 +// CHECK11-NEXT: [[TMP29:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 4 +// CHECK11-NEXT: [[TMP30:%.*]] = bitcast i32* [[TMP20]] to i8* +// CHECK11-NEXT: store i8* [[TMP30]], i8** [[TMP29]], align 8 +// CHECK11-NEXT: [[TMP31:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK11-NEXT: [[TMP32:%.*]] = load i32, i32* [[TMP31]], align 4 +// CHECK11-NEXT: [[TMP33:%.*]] = bitcast [5 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK11-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP32]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i64, i64, i64, [1000 x i32]*, i32*)* @__omp_outlined__11 to i8*), i8* null, i8** [[TMP33]], i64 5) +// CHECK11-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK11: omp.inner.for.inc: +// CHECK11-NEXT: [[TMP34:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK11-NEXT: [[TMP35:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK11-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP34]], [[TMP35]] +// CHECK11-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_IV]], align 4 +// CHECK11-NEXT: [[TMP36:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK11-NEXT: [[TMP37:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK11-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP36]], [[TMP37]] +// CHECK11-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK11-NEXT: [[TMP38:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK11-NEXT: [[TMP39:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK11-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP38]], [[TMP39]] +// CHECK11-NEXT: store i32 [[ADD9]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK11-NEXT: [[TMP40:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK11-NEXT: [[TMP41:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK11-NEXT: [[CMP10:%.*]] = icmp sgt i32 [[TMP40]], [[TMP41]] +// CHECK11-NEXT: br i1 [[CMP10]], label [[COND_TRUE11:%.*]], label [[COND_FALSE12:%.*]] +// CHECK11: cond.true11: +// CHECK11-NEXT: [[TMP42:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK11-NEXT: br label [[COND_END13:%.*]] +// CHECK11: cond.false12: +// CHECK11-NEXT: [[TMP43:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK11-NEXT: br label [[COND_END13]] +// CHECK11: cond.end13: +// CHECK11-NEXT: [[COND14:%.*]] = phi i32 [ [[TMP42]], [[COND_TRUE11]] ], [ [[TMP43]], [[COND_FALSE12]] ] +// CHECK11-NEXT: store i32 [[COND14]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK11-NEXT: [[TMP44:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK11-NEXT: store i32 [[TMP44]], i32* [[DOTOMP_IV]], align 4 +// CHECK11-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK11: omp.inner.for.end: +// CHECK11-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK11: omp.loop.exit: +// CHECK11-NEXT: [[TMP45:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK11-NEXT: [[TMP46:%.*]] = load i32, i32* [[TMP45]], align 4 +// CHECK11-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP46]]) +// CHECK11-NEXT: br label [[OMP_PRECOND_END]] +// CHECK11: omp.precond.end: +// CHECK11-NEXT: ret void +// +// +// CHECK11-LABEL: 
define {{[^@]+}}@__omp_outlined__11 +// CHECK11-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], i64 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32* [[V:%.*]]) #[[ATTR0]] { +// CHECK11-NEXT: entry: +// CHECK11-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK11-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK11-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8 +// CHECK11-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8 +// CHECK11-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 +// CHECK11-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 8 +// CHECK11-NEXT: [[V_ADDR:%.*]] = alloca i32*, align 8 +// CHECK11-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[I5:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK11-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +// CHECK11-NEXT: store i64 [[DOTPREVIOUS_LB_]], i64* [[DOTPREVIOUS_LB__ADDR]], align 8 +// CHECK11-NEXT: store i64 [[DOTPREVIOUS_UB_]], i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK11-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 +// CHECK11-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 8 +// CHECK11-NEXT: store i32* [[V]], i32** [[V_ADDR]], align 8 +// CHECK11-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* +// CHECK11-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 8 +// CHECK11-NEXT: [[TMP1:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK11-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK11-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK11-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 +// CHECK11-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK11-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK11-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK11-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK11-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK11-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] +// CHECK11-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK11: omp.precond.then: +// CHECK11-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 +// CHECK11-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK11-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_UB]], align 4 +// CHECK11-NEXT: [[TMP5:%.*]] = load i64, i64* [[DOTPREVIOUS_LB__ADDR]], align 8 +// CHECK11-NEXT: [[CONV3:%.*]] = trunc i64 [[TMP5]] to i32 +// CHECK11-NEXT: [[TMP6:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK11-NEXT: [[CONV4:%.*]] = trunc i64 [[TMP6]] to i32 +// CHECK11-NEXT: store i32 [[CONV3]], i32* [[DOTOMP_LB]], align 4 +// CHECK11-NEXT: store i32 [[CONV4]], i32* [[DOTOMP_UB]], align 4 +// CHECK11-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK11-NEXT: store i32 0, i32* 
[[DOTOMP_IS_LAST]], align 4 +// CHECK11-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK11-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4 +// CHECK11-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP8]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) +// CHECK11-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK11-NEXT: store i32 [[TMP9]], i32* [[DOTOMP_IV]], align 4 +// CHECK11-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK11: omp.inner.for.cond: +// CHECK11-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK11-NEXT: [[CONV6:%.*]] = sext i32 [[TMP10]] to i64 +// CHECK11-NEXT: [[TMP11:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK11-NEXT: [[CMP7:%.*]] = icmp ule i64 [[CONV6]], [[TMP11]] +// CHECK11-NEXT: br i1 [[CMP7]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK11: omp.inner.for.body: +// CHECK11-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK11-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP12]], 1 +// CHECK11-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK11-NEXT: store i32 [[ADD]], i32* [[I5]], align 4 +// CHECK11-NEXT: [[TMP13:%.*]] = load i32*, i32** [[V_ADDR]], align 8 +// CHECK11-NEXT: [[TMP14:%.*]] = load i32, i32* [[I5]], align 4 +// CHECK11-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP14]] to i64 +// CHECK11-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TMP13]], i64 [[IDXPROM]] +// CHECK11-NEXT: [[TMP15:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +// CHECK11-NEXT: [[TMP16:%.*]] = load i32, i32* [[I5]], align 4 +// CHECK11-NEXT: [[IDXPROM8:%.*]] = sext i32 [[TMP16]] to i64 +// CHECK11-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds [1000 x i32], [1000 x i32]* [[TMP0]], i64 0, i64 [[IDXPROM8]] +// CHECK11-NEXT: store i32 [[TMP15]], i32* [[ARRAYIDX9]], align 4 +// CHECK11-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK11: omp.body.continue: +// CHECK11-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK11: omp.inner.for.inc: +// CHECK11-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK11-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK11-NEXT: [[ADD10:%.*]] = add nsw i32 [[TMP17]], [[TMP18]] +// CHECK11-NEXT: store i32 [[ADD10]], i32* [[DOTOMP_IV]], align 4 +// CHECK11-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK11: omp.inner.for.end: +// CHECK11-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK11: omp.loop.exit: +// CHECK11-NEXT: [[TMP19:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK11-NEXT: [[TMP20:%.*]] = load i32, i32* [[TMP19]], align 4 +// CHECK11-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP20]]) +// CHECK11-NEXT: br label [[OMP_PRECOND_END]] +// CHECK11: omp.precond.end: +// CHECK11-NEXT: ret void +// +// +// CHECK12-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l43 +// CHECK12-SAME: (i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32 [[L:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK12-NEXT: entry: +// CHECK12-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4 +// CHECK12-NEXT: [[L_ADDR:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[L_CASTED:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: 
[[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK12-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK12-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4 +// CHECK12-NEXT: store i32 [[L]], i32* [[L_ADDR]], align 4 +// CHECK12-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4 +// CHECK12-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK12-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) +// CHECK12-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK12: .execute: +// CHECK12-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3:[0-9]+]]) +// CHECK12-NEXT: [[TMP2:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK12-NEXT: store i32 [[TMP2]], i32* [[N_CASTED]], align 4 +// CHECK12-NEXT: [[TMP3:%.*]] = load i32, i32* [[N_CASTED]], align 4 +// CHECK12-NEXT: [[TMP4:%.*]] = load i32, i32* [[L_ADDR]], align 4 +// CHECK12-NEXT: store i32 [[TMP4]], i32* [[L_CASTED]], align 4 +// CHECK12-NEXT: [[TMP5:%.*]] = load i32, i32* [[L_CASTED]], align 4 +// CHECK12-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK12-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32 [[TMP3]], [1000 x i32]* [[TMP0]], i32 [[TMP5]]) #[[ATTR2:[0-9]+]] +// CHECK12-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK12: .omp.deinit: +// CHECK12-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) +// CHECK12-NEXT: br label [[DOTEXIT:%.*]] +// CHECK12: .exit: +// CHECK12-NEXT: ret void +// +// +// CHECK12-LABEL: define {{[^@]+}}@__omp_outlined__ +// CHECK12-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32 [[L:%.*]]) #[[ATTR0]] { +// CHECK12-NEXT: entry: +// CHECK12-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK12-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK12-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4 +// CHECK12-NEXT: [[L_ADDR:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[I4:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[L_CASTED:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [5 x i8*], align 4 +// CHECK12-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK12-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK12-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK12-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4 +// CHECK12-NEXT: store i32 [[L]], i32* [[L_ADDR]], align 4 +// CHECK12-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4 +// CHECK12-NEXT: [[TMP1:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared", align 2 
+// CHECK12-NEXT: [[TMP2:%.*]] = load i32, i32* @"_openmp_static_kernel$size", align 4 +// CHECK12-NEXT: call void @__kmpc_get_team_static_memory(i16 1, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds (%"union._shared_openmp_static_memory_type_$_", %"union._shared_openmp_static_memory_type_$_" addrspace(3)* @"_openmp_shared_static_glob_rd_$_", i32 0, i32 0, i32 0) to i8*), i32 [[TMP2]], i16 [[TMP1]], i8** addrspacecast (i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr" to i8**)) +// CHECK12-NEXT: [[TMP3:%.*]] = load i8*, i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr", align 4 +// CHECK12-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, i8* [[TMP3]], i32 0 +// CHECK12-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to %struct._globalized_locals_ty* +// CHECK12-NEXT: [[L1:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP5]], i32 0, i32 0 +// CHECK12-NEXT: [[TMP6:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK12-NEXT: store i32 [[TMP6]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK12-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK12-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP7]], 0 +// CHECK12-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK12-NEXT: [[SUB3:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK12-NEXT: store i32 [[SUB3]], i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK12-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK12-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK12-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP8]] +// CHECK12-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK12: omp.precond.then: +// CHECK12-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK12-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK12-NEXT: store i32 [[TMP9]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK12-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK12-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK12-NEXT: [[TMP10:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK12-NEXT: [[TMP11:%.*]] = load i32, i32* [[TMP10]], align 4 +// CHECK12-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1:[0-9]+]], i32 [[TMP11]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 128) +// CHECK12-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK12-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK12-NEXT: [[CMP5:%.*]] = icmp sgt i32 [[TMP12]], [[TMP13]] +// CHECK12-NEXT: br i1 [[CMP5]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK12: cond.true: +// CHECK12-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK12-NEXT: br label [[COND_END:%.*]] +// CHECK12: cond.false: +// CHECK12-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK12-NEXT: br label [[COND_END]] +// CHECK12: cond.end: +// CHECK12-NEXT: [[COND:%.*]] = phi i32 [ [[TMP14]], [[COND_TRUE]] ], [ [[TMP15]], [[COND_FALSE]] ] +// CHECK12-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK12-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK12-NEXT: store i32 [[TMP16]], i32* [[DOTOMP_IV]], align 4 +// CHECK12-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK12: omp.inner.for.cond: +// CHECK12-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK12-NEXT: [[TMP18:%.*]] = load i32, 
i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK12-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP18]], 1 +// CHECK12-NEXT: [[CMP6:%.*]] = icmp slt i32 [[TMP17]], [[ADD]] +// CHECK12-NEXT: br i1 [[CMP6]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK12: omp.inner.for.body: +// CHECK12-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK12-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK12-NEXT: [[TMP21:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK12-NEXT: store i32 [[TMP21]], i32* [[N_CASTED]], align 4 +// CHECK12-NEXT: [[TMP22:%.*]] = load i32, i32* [[N_CASTED]], align 4 +// CHECK12-NEXT: [[TMP23:%.*]] = load i32, i32* [[L_ADDR]], align 4 +// CHECK12-NEXT: store i32 [[TMP23]], i32* [[L_CASTED]], align 4 +// CHECK12-NEXT: [[TMP24:%.*]] = load i32, i32* [[L_CASTED]], align 4 +// CHECK12-NEXT: [[TMP25:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 +// CHECK12-NEXT: [[TMP26:%.*]] = inttoptr i32 [[TMP19]] to i8* +// CHECK12-NEXT: store i8* [[TMP26]], i8** [[TMP25]], align 4 +// CHECK12-NEXT: [[TMP27:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 +// CHECK12-NEXT: [[TMP28:%.*]] = inttoptr i32 [[TMP20]] to i8* +// CHECK12-NEXT: store i8* [[TMP28]], i8** [[TMP27]], align 4 +// CHECK12-NEXT: [[TMP29:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 +// CHECK12-NEXT: [[TMP30:%.*]] = inttoptr i32 [[TMP22]] to i8* +// CHECK12-NEXT: store i8* [[TMP30]], i8** [[TMP29]], align 4 +// CHECK12-NEXT: [[TMP31:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3 +// CHECK12-NEXT: [[TMP32:%.*]] = bitcast [1000 x i32]* [[TMP0]] to i8* +// CHECK12-NEXT: store i8* [[TMP32]], i8** [[TMP31]], align 4 +// CHECK12-NEXT: [[TMP33:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 4 +// CHECK12-NEXT: [[TMP34:%.*]] = inttoptr i32 [[TMP24]] to i8* +// CHECK12-NEXT: store i8* [[TMP34]], i8** [[TMP33]], align 4 +// CHECK12-NEXT: [[TMP35:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK12-NEXT: [[TMP36:%.*]] = load i32, i32* [[TMP35]], align 4 +// CHECK12-NEXT: [[TMP37:%.*]] = bitcast [5 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK12-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP36]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, i32, [1000 x i32]*, i32)* @__omp_outlined__1 to i8*), i8* null, i8** [[TMP37]], i32 5) +// CHECK12-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK12: omp.inner.for.inc: +// CHECK12-NEXT: [[TMP38:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK12-NEXT: [[TMP39:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK12-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP38]], [[TMP39]] +// CHECK12-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_IV]], align 4 +// CHECK12-NEXT: [[TMP40:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK12-NEXT: [[TMP41:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK12-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP40]], [[TMP41]] +// CHECK12-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK12-NEXT: [[TMP42:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK12-NEXT: [[TMP43:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK12-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP42]], [[TMP43]] +// CHECK12-NEXT: store i32 [[ADD9]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK12-NEXT: [[TMP44:%.*]] = load i32, 
i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK12-NEXT: [[TMP45:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK12-NEXT: [[CMP10:%.*]] = icmp sgt i32 [[TMP44]], [[TMP45]] +// CHECK12-NEXT: br i1 [[CMP10]], label [[COND_TRUE11:%.*]], label [[COND_FALSE12:%.*]] +// CHECK12: cond.true11: +// CHECK12-NEXT: [[TMP46:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK12-NEXT: br label [[COND_END13:%.*]] +// CHECK12: cond.false12: +// CHECK12-NEXT: [[TMP47:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK12-NEXT: br label [[COND_END13]] +// CHECK12: cond.end13: +// CHECK12-NEXT: [[COND14:%.*]] = phi i32 [ [[TMP46]], [[COND_TRUE11]] ], [ [[TMP47]], [[COND_FALSE12]] ] +// CHECK12-NEXT: store i32 [[COND14]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK12-NEXT: [[TMP48:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK12-NEXT: store i32 [[TMP48]], i32* [[DOTOMP_IV]], align 4 +// CHECK12-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK12: omp.inner.for.end: +// CHECK12-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK12: omp.loop.exit: +// CHECK12-NEXT: [[TMP49:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK12-NEXT: [[TMP50:%.*]] = load i32, i32* [[TMP49]], align 4 +// CHECK12-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP50]]) +// CHECK12-NEXT: [[TMP51:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK12-NEXT: [[TMP52:%.*]] = icmp ne i32 [[TMP51]], 0 +// CHECK12-NEXT: br i1 [[TMP52]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]] +// CHECK12: .omp.lastprivate.then: +// CHECK12-NEXT: [[TMP53:%.*]] = load i32, i32* [[L_ADDR]], align 4 +// CHECK12-NEXT: store i32 [[TMP53]], i32* [[L_ADDR]], align 4 +// CHECK12-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]] +// CHECK12: .omp.lastprivate.done: +// CHECK12-NEXT: br label [[OMP_PRECOND_END]] +// CHECK12: omp.precond.end: +// CHECK12-NEXT: [[TMP54:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared", align 2 +// CHECK12-NEXT: call void @__kmpc_restore_team_static_memory(i16 1, i16 [[TMP54]]) +// CHECK12-NEXT: ret void +// +// +// CHECK12-LABEL: define {{[^@]+}}@__omp_outlined__1 +// CHECK12-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32 [[L:%.*]]) #[[ATTR0]] { +// CHECK12-NEXT: entry: +// CHECK12-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK12-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK12-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4 +// CHECK12-NEXT: [[L_ADDR:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[I3:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** 
[[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK12-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK12-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK12-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK12-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK12-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4 +// CHECK12-NEXT: store i32 [[L]], i32* [[L_ADDR]], align 4 +// CHECK12-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4 +// CHECK12-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK12-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK12-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK12-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 +// CHECK12-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK12-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK12-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK12-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK12-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK12-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] +// CHECK12-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK12: omp.precond.then: +// CHECK12-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 +// CHECK12-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK12-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_UB]], align 4 +// CHECK12-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK12-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK12-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_LB]], align 4 +// CHECK12-NEXT: store i32 [[TMP6]], i32* [[DOTOMP_UB]], align 4 +// CHECK12-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK12-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK12-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK12-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4 +// CHECK12-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP8]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 32) +// CHECK12-NEXT: br label [[OMP_DISPATCH_COND:%.*]] +// CHECK12: omp.dispatch.cond: +// CHECK12-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK12-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK12-NEXT: [[CMP4:%.*]] = icmp ugt i32 [[TMP9]], [[TMP10]] +// CHECK12-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK12: cond.true: +// CHECK12-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK12-NEXT: br label [[COND_END:%.*]] +// CHECK12: cond.false: +// CHECK12-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK12-NEXT: br label [[COND_END]] +// CHECK12: cond.end: +// CHECK12-NEXT: [[COND:%.*]] = phi i32 [ [[TMP11]], [[COND_TRUE]] ], [ [[TMP12]], [[COND_FALSE]] ] +// CHECK12-NEXT: store i32 [[COND]], i32* [[DOTOMP_UB]], align 4 +// CHECK12-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK12-NEXT: store i32 [[TMP13]], i32* [[DOTOMP_IV]], align 4 +// CHECK12-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK12-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK12-NEXT: [[CMP5:%.*]] = icmp sle i32 [[TMP14]], 
[[TMP15]] +// CHECK12-NEXT: br i1 [[CMP5]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]] +// CHECK12: omp.dispatch.body: +// CHECK12-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK12: omp.inner.for.cond: +// CHECK12-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK12-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK12-NEXT: [[CMP6:%.*]] = icmp sle i32 [[TMP16]], [[TMP17]] +// CHECK12-NEXT: br i1 [[CMP6]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK12: omp.inner.for.body: +// CHECK12-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK12-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP18]], 1 +// CHECK12-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK12-NEXT: store i32 [[ADD]], i32* [[I3]], align 4 +// CHECK12-NEXT: [[TMP19:%.*]] = load i32, i32* [[I3]], align 4 +// CHECK12-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1000 x i32], [1000 x i32]* [[TMP0]], i32 0, i32 [[TMP19]] +// CHECK12-NEXT: store i32 1, i32* [[ARRAYIDX]], align 4 +// CHECK12-NEXT: [[TMP20:%.*]] = load i32, i32* [[I3]], align 4 +// CHECK12-NEXT: store i32 [[TMP20]], i32* [[L_ADDR]], align 4 +// CHECK12-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK12: omp.body.continue: +// CHECK12-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK12: omp.inner.for.inc: +// CHECK12-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK12-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP21]], 1 +// CHECK12-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_IV]], align 4 +// CHECK12-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK12: omp.inner.for.end: +// CHECK12-NEXT: br label [[OMP_DISPATCH_INC:%.*]] +// CHECK12: omp.dispatch.inc: +// CHECK12-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK12-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK12-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP22]], [[TMP23]] +// CHECK12-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_LB]], align 4 +// CHECK12-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK12-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK12-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP24]], [[TMP25]] +// CHECK12-NEXT: store i32 [[ADD9]], i32* [[DOTOMP_UB]], align 4 +// CHECK12-NEXT: br label [[OMP_DISPATCH_COND]] +// CHECK12: omp.dispatch.end: +// CHECK12-NEXT: [[TMP26:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK12-NEXT: [[TMP27:%.*]] = load i32, i32* [[TMP26]], align 4 +// CHECK12-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP27]]) +// CHECK12-NEXT: [[TMP28:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK12-NEXT: [[TMP29:%.*]] = icmp ne i32 [[TMP28]], 0 +// CHECK12-NEXT: br i1 [[TMP29]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]] +// CHECK12: .omp.lastprivate.then: +// CHECK12-NEXT: [[TMP30:%.*]] = load i32, i32* [[L_ADDR]], align 4 +// CHECK12-NEXT: store i32 [[TMP30]], i32* [[L_ADDR]], align 4 +// CHECK12-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]] +// CHECK12: .omp.lastprivate.done: +// CHECK12-NEXT: br label [[OMP_PRECOND_END]] +// CHECK12: omp.precond.end: +// CHECK12-NEXT: ret void +// +// +// CHECK12-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l49 +// CHECK12-SAME: (i32 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { +// CHECK12-NEXT: entry: +// CHECK12-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[AA_ADDR:%.*]] = 
alloca [1000 x i16]*, align 4 +// CHECK12-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK12-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK12-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 4 +// CHECK12-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 4 +// CHECK12-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK12-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) +// CHECK12-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK12: .execute: +// CHECK12-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) +// CHECK12-NEXT: [[TMP2:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK12-NEXT: store i32 [[TMP2]], i32* [[N_CASTED]], align 4 +// CHECK12-NEXT: [[TMP3:%.*]] = load i32, i32* [[N_CASTED]], align 4 +// CHECK12-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK12-NEXT: call void @__omp_outlined__2(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32 [[TMP3]], [1000 x i16]* [[TMP0]]) #[[ATTR2]] +// CHECK12-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK12: .omp.deinit: +// CHECK12-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) +// CHECK12-NEXT: br label [[DOTEXIT:%.*]] +// CHECK12: .exit: +// CHECK12-NEXT: ret void +// +// +// CHECK12-LABEL: define {{[^@]+}}@__omp_outlined__2 +// CHECK12-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { +// CHECK12-NEXT: entry: +// CHECK12-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK12-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK12-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 4 +// CHECK12-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[I3:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x i8*], align 4 +// CHECK12-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK12-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK12-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK12-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 4 +// CHECK12-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 4 +// CHECK12-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK12-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK12-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK12-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 +// CHECK12-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK12-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 +// 
CHECK12-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK12-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK12-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK12-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] +// CHECK12-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK12: omp.precond.then: +// CHECK12-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK12-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK12-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK12-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK12-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK12-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK12-NEXT: [[TMP5:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK12-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4 +// CHECK12-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP6]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) +// CHECK12-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK12-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK12-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP7]], [[TMP8]] +// CHECK12-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK12: cond.true: +// CHECK12-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK12-NEXT: br label [[COND_END:%.*]] +// CHECK12: cond.false: +// CHECK12-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK12-NEXT: br label [[COND_END]] +// CHECK12: cond.end: +// CHECK12-NEXT: [[COND:%.*]] = phi i32 [ [[TMP9]], [[COND_TRUE]] ], [ [[TMP10]], [[COND_FALSE]] ] +// CHECK12-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK12-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK12-NEXT: store i32 [[TMP11]], i32* [[DOTOMP_IV]], align 4 +// CHECK12-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK12: omp.inner.for.cond: +// CHECK12-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK12-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK12-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP13]], 1 +// CHECK12-NEXT: [[CMP5:%.*]] = icmp slt i32 [[TMP12]], [[ADD]] +// CHECK12-NEXT: br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK12: omp.inner.for.body: +// CHECK12-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK12-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK12-NEXT: [[TMP16:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK12-NEXT: store i32 [[TMP16]], i32* [[N_CASTED]], align 4 +// CHECK12-NEXT: [[TMP17:%.*]] = load i32, i32* [[N_CASTED]], align 4 +// CHECK12-NEXT: [[TMP18:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 +// CHECK12-NEXT: [[TMP19:%.*]] = inttoptr i32 [[TMP14]] to i8* +// CHECK12-NEXT: store i8* [[TMP19]], i8** [[TMP18]], align 4 +// CHECK12-NEXT: [[TMP20:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 +// CHECK12-NEXT: [[TMP21:%.*]] = inttoptr i32 [[TMP15]] to i8* +// CHECK12-NEXT: store i8* [[TMP21]], i8** [[TMP20]], align 4 +// CHECK12-NEXT: [[TMP22:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* 
[[CAPTURED_VARS_ADDRS]], i32 0, i32 2 +// CHECK12-NEXT: [[TMP23:%.*]] = inttoptr i32 [[TMP17]] to i8* +// CHECK12-NEXT: store i8* [[TMP23]], i8** [[TMP22]], align 4 +// CHECK12-NEXT: [[TMP24:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3 +// CHECK12-NEXT: [[TMP25:%.*]] = bitcast [1000 x i16]* [[TMP0]] to i8* +// CHECK12-NEXT: store i8* [[TMP25]], i8** [[TMP24]], align 4 +// CHECK12-NEXT: [[TMP26:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK12-NEXT: [[TMP27:%.*]] = load i32, i32* [[TMP26]], align 4 +// CHECK12-NEXT: [[TMP28:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK12-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP27]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, i32, [1000 x i16]*)* @__omp_outlined__3 to i8*), i8* null, i8** [[TMP28]], i32 4) +// CHECK12-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK12: omp.inner.for.inc: +// CHECK12-NEXT: [[TMP29:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK12-NEXT: [[TMP30:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK12-NEXT: [[ADD6:%.*]] = add nsw i32 [[TMP29]], [[TMP30]] +// CHECK12-NEXT: store i32 [[ADD6]], i32* [[DOTOMP_IV]], align 4 +// CHECK12-NEXT: [[TMP31:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK12-NEXT: [[TMP32:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK12-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP31]], [[TMP32]] +// CHECK12-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK12-NEXT: [[TMP33:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK12-NEXT: [[TMP34:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK12-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP33]], [[TMP34]] +// CHECK12-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK12-NEXT: [[TMP35:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK12-NEXT: [[TMP36:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK12-NEXT: [[CMP9:%.*]] = icmp sgt i32 [[TMP35]], [[TMP36]] +// CHECK12-NEXT: br i1 [[CMP9]], label [[COND_TRUE10:%.*]], label [[COND_FALSE11:%.*]] +// CHECK12: cond.true10: +// CHECK12-NEXT: [[TMP37:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK12-NEXT: br label [[COND_END12:%.*]] +// CHECK12: cond.false11: +// CHECK12-NEXT: [[TMP38:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK12-NEXT: br label [[COND_END12]] +// CHECK12: cond.end12: +// CHECK12-NEXT: [[COND13:%.*]] = phi i32 [ [[TMP37]], [[COND_TRUE10]] ], [ [[TMP38]], [[COND_FALSE11]] ] +// CHECK12-NEXT: store i32 [[COND13]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK12-NEXT: [[TMP39:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK12-NEXT: store i32 [[TMP39]], i32* [[DOTOMP_IV]], align 4 +// CHECK12-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK12: omp.inner.for.end: +// CHECK12-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK12: omp.loop.exit: +// CHECK12-NEXT: [[TMP40:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK12-NEXT: [[TMP41:%.*]] = load i32, i32* [[TMP40]], align 4 +// CHECK12-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP41]]) +// CHECK12-NEXT: br label [[OMP_PRECOND_END]] +// CHECK12: omp.precond.end: +// CHECK12-NEXT: ret void +// +// +// CHECK12-LABEL: define {{[^@]+}}@__omp_outlined__3 +// CHECK12-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32 [[N:%.*]], [1000 x i16]* 
nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { +// CHECK12-NEXT: entry: +// CHECK12-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK12-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK12-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 4 +// CHECK12-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[I3:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK12-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK12-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK12-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK12-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK12-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 4 +// CHECK12-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 4 +// CHECK12-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK12-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK12-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK12-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 +// CHECK12-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK12-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK12-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK12-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK12-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK12-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] +// CHECK12-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK12: omp.precond.then: +// CHECK12-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 +// CHECK12-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK12-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_UB]], align 4 +// CHECK12-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK12-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK12-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_LB]], align 4 +// CHECK12-NEXT: store i32 [[TMP6]], i32* [[DOTOMP_UB]], align 4 +// CHECK12-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK12-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK12-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK12-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4 +// CHECK12-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP8]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) +// CHECK12-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK12-NEXT: store i32 [[TMP9]], i32* [[DOTOMP_IV]], align 4 +// 
CHECK12-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK12: omp.inner.for.cond: +// CHECK12-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK12-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK12-NEXT: [[CMP4:%.*]] = icmp ule i32 [[TMP10]], [[TMP11]] +// CHECK12-NEXT: br i1 [[CMP4]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK12: omp.inner.for.body: +// CHECK12-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK12-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP12]], 1 +// CHECK12-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK12-NEXT: store i32 [[ADD]], i32* [[I3]], align 4 +// CHECK12-NEXT: [[TMP13:%.*]] = load i32, i32* [[I3]], align 4 +// CHECK12-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1000 x i16], [1000 x i16]* [[TMP0]], i32 0, i32 [[TMP13]] +// CHECK12-NEXT: [[TMP14:%.*]] = load i16, i16* [[ARRAYIDX]], align 2 +// CHECK12-NEXT: [[CONV:%.*]] = sext i16 [[TMP14]] to i32 +// CHECK12-NEXT: [[ADD5:%.*]] = add nsw i32 [[CONV]], 1 +// CHECK12-NEXT: [[CONV6:%.*]] = trunc i32 [[ADD5]] to i16 +// CHECK12-NEXT: store i16 [[CONV6]], i16* [[ARRAYIDX]], align 2 +// CHECK12-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK12: omp.body.continue: +// CHECK12-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK12: omp.inner.for.inc: +// CHECK12-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK12-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK12-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP15]], [[TMP16]] +// CHECK12-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_IV]], align 4 +// CHECK12-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK12: omp.inner.for.end: +// CHECK12-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK12: omp.loop.exit: +// CHECK12-NEXT: [[TMP17:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK12-NEXT: [[TMP18:%.*]] = load i32, i32* [[TMP17]], align 4 +// CHECK12-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP18]]) +// CHECK12-NEXT: br label [[OMP_PRECOND_END]] +// CHECK12: omp.precond.end: +// CHECK12-NEXT: ret void +// +// +// CHECK12-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l54 +// CHECK12-SAME: ([10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { +// CHECK12-NEXT: entry: +// CHECK12-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 +// CHECK12-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK12-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 +// CHECK12-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 +// CHECK12-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK12-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) +// CHECK12-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK12: .execute: +// CHECK12-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) +// CHECK12-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK12-NEXT: call void @__omp_outlined__4(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x i32]* [[TMP0]]) #[[ATTR2]] +// CHECK12-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK12: .omp.deinit: +// CHECK12-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) +// CHECK12-NEXT: br label [[DOTEXIT:%.*]] +// CHECK12: .exit: +// CHECK12-NEXT: ret void +// 
+// +// CHECK12-LABEL: define {{[^@]+}}@__omp_outlined__4 +// CHECK12-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { +// CHECK12-NEXT: entry: +// CHECK12-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK12-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK12-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 +// CHECK12-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [3 x i8*], align 4 +// CHECK12-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK12-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK12-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 +// CHECK12-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 +// CHECK12-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK12-NEXT: store i32 9, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK12-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK12-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK12-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK12-NEXT: [[TMP1:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK12-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4 +// CHECK12-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) +// CHECK12-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK12-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP3]], 9 +// CHECK12-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK12: cond.true: +// CHECK12-NEXT: br label [[COND_END:%.*]] +// CHECK12: cond.false: +// CHECK12-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK12-NEXT: br label [[COND_END]] +// CHECK12: cond.end: +// CHECK12-NEXT: [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP4]], [[COND_FALSE]] ] +// CHECK12-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK12-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK12-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 +// CHECK12-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK12: omp.inner.for.cond: +// CHECK12-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK12-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP6]], 10 +// CHECK12-NEXT: br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK12: omp.inner.for.body: +// CHECK12-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK12-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK12-NEXT: [[TMP9:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 +// CHECK12-NEXT: [[TMP10:%.*]] = inttoptr i32 [[TMP7]] to i8* +// CHECK12-NEXT: store i8* [[TMP10]], i8** [[TMP9]], align 4 +// CHECK12-NEXT: [[TMP11:%.*]] = getelementptr inbounds [3 x 
i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 +// CHECK12-NEXT: [[TMP12:%.*]] = inttoptr i32 [[TMP8]] to i8* +// CHECK12-NEXT: store i8* [[TMP12]], i8** [[TMP11]], align 4 +// CHECK12-NEXT: [[TMP13:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 +// CHECK12-NEXT: [[TMP14:%.*]] = bitcast [10 x i32]* [[TMP0]] to i8* +// CHECK12-NEXT: store i8* [[TMP14]], i8** [[TMP13]], align 4 +// CHECK12-NEXT: [[TMP15:%.*]] = bitcast [3 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK12-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, [10 x i32]*)* @__omp_outlined__5 to i8*), i8* null, i8** [[TMP15]], i32 3) +// CHECK12-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK12: omp.inner.for.inc: +// CHECK12-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK12-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK12-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP16]], [[TMP17]] +// CHECK12-NEXT: store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4 +// CHECK12-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK12-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK12-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP18]], [[TMP19]] +// CHECK12-NEXT: store i32 [[ADD2]], i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK12-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK12-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK12-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP20]], [[TMP21]] +// CHECK12-NEXT: store i32 [[ADD3]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK12-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK12-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP22]], 9 +// CHECK12-NEXT: br i1 [[CMP4]], label [[COND_TRUE5:%.*]], label [[COND_FALSE6:%.*]] +// CHECK12: cond.true5: +// CHECK12-NEXT: br label [[COND_END7:%.*]] +// CHECK12: cond.false6: +// CHECK12-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK12-NEXT: br label [[COND_END7]] +// CHECK12: cond.end7: +// CHECK12-NEXT: [[COND8:%.*]] = phi i32 [ 9, [[COND_TRUE5]] ], [ [[TMP23]], [[COND_FALSE6]] ] +// CHECK12-NEXT: store i32 [[COND8]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK12-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK12-NEXT: store i32 [[TMP24]], i32* [[DOTOMP_IV]], align 4 +// CHECK12-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK12: omp.inner.for.end: +// CHECK12-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK12: omp.loop.exit: +// CHECK12-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]]) +// CHECK12-NEXT: ret void +// +// +// CHECK12-LABEL: define {{[^@]+}}@__omp_outlined__5 +// CHECK12-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { +// CHECK12-NEXT: entry: +// CHECK12-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK12-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK12-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 +// CHECK12-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 
4 +// CHECK12-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK12-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK12-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK12-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK12-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 +// CHECK12-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 +// CHECK12-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 +// CHECK12-NEXT: store i32 9, i32* [[DOTOMP_UB]], align 4 +// CHECK12-NEXT: [[TMP1:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK12-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK12-NEXT: store i32 [[TMP1]], i32* [[DOTOMP_LB]], align 4 +// CHECK12-NEXT: store i32 [[TMP2]], i32* [[DOTOMP_UB]], align 4 +// CHECK12-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK12-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK12-NEXT: [[TMP3:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK12-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4 +// CHECK12-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP4]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) +// CHECK12-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK12-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 +// CHECK12-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK12: omp.inner.for.cond: +// CHECK12-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK12-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK12-NEXT: [[CMP:%.*]] = icmp ule i32 [[TMP6]], [[TMP7]] +// CHECK12-NEXT: br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK12: omp.inner.for.body: +// CHECK12-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK12-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP8]], 1 +// CHECK12-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK12-NEXT: store i32 [[ADD]], i32* [[I]], align 4 +// CHECK12-NEXT: [[TMP9:%.*]] = load i32, i32* [[I]], align 4 +// CHECK12-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP0]], i32 0, i32 [[TMP9]] +// CHECK12-NEXT: [[TMP10:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +// CHECK12-NEXT: [[ADD1:%.*]] = add nsw i32 [[TMP10]], 1 +// CHECK12-NEXT: store i32 [[ADD1]], i32* [[ARRAYIDX]], align 4 +// CHECK12-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK12: omp.body.continue: +// CHECK12-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK12: omp.inner.for.inc: +// CHECK12-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK12-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK12-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP11]], [[TMP12]] +// CHECK12-NEXT: store i32 [[ADD2]], i32* [[DOTOMP_IV]], align 4 +// CHECK12-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK12: omp.inner.for.end: +// CHECK12-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK12: omp.loop.exit: +// CHECK12-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP4]]) +// CHECK12-NEXT: ret 
void +// +// +// CHECK12-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l59 +// CHECK12-SAME: ([10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i32 [[F:%.*]]) #[[ATTR0]] { +// CHECK12-NEXT: entry: +// CHECK12-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4 +// CHECK12-NEXT: [[F_ADDR:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[F_CASTED:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK12-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4 +// CHECK12-NEXT: store i32 [[F]], i32* [[F_ADDR]], align 4 +// CHECK12-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4 +// CHECK12-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK12-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) +// CHECK12-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK12: .execute: +// CHECK12-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) +// CHECK12-NEXT: [[TMP2:%.*]] = load i32, i32* [[F_ADDR]], align 4 +// CHECK12-NEXT: store i32 [[TMP2]], i32* [[F_CASTED]], align 4 +// CHECK12-NEXT: [[TMP3:%.*]] = load i32, i32* [[F_CASTED]], align 4 +// CHECK12-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK12-NEXT: call void @__omp_outlined__6(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x [10 x i32]]* [[TMP0]], i32 [[TMP3]]) #[[ATTR2]] +// CHECK12-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK12: .omp.deinit: +// CHECK12-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) +// CHECK12-NEXT: br label [[DOTEXIT:%.*]] +// CHECK12: .exit: +// CHECK12-NEXT: ret void +// +// +// CHECK12-LABEL: define {{[^@]+}}@__omp_outlined__6 +// CHECK12-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i32 [[F:%.*]]) #[[ATTR0]] { +// CHECK12-NEXT: entry: +// CHECK12-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK12-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK12-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4 +// CHECK12-NEXT: [[F_ADDR:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[_TMP1:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[K:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[J:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[F_CASTED:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x i8*], align 4 +// CHECK12-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK12-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK12-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4 +// CHECK12-NEXT: store i32 [[F]], i32* [[F_ADDR]], align 4 +// CHECK12-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4 +// CHECK12-NEXT: store i32 
0, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK12-NEXT: store i32 99, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK12-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK12-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK12-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK12-NEXT: [[TMP1:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK12-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4 +// CHECK12-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) +// CHECK12-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK12-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP3]], 99 +// CHECK12-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK12: cond.true: +// CHECK12-NEXT: br label [[COND_END:%.*]] +// CHECK12: cond.false: +// CHECK12-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK12-NEXT: br label [[COND_END]] +// CHECK12: cond.end: +// CHECK12-NEXT: [[COND:%.*]] = phi i32 [ 99, [[COND_TRUE]] ], [ [[TMP4]], [[COND_FALSE]] ] +// CHECK12-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK12-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK12-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 +// CHECK12-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK12: omp.inner.for.cond: +// CHECK12-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK12-NEXT: [[CMP2:%.*]] = icmp slt i32 [[TMP6]], 100 +// CHECK12-NEXT: br i1 [[CMP2]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK12: omp.inner.for.body: +// CHECK12-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK12-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK12-NEXT: [[TMP9:%.*]] = load i32, i32* [[F_ADDR]], align 4 +// CHECK12-NEXT: store i32 [[TMP9]], i32* [[F_CASTED]], align 4 +// CHECK12-NEXT: [[TMP10:%.*]] = load i32, i32* [[F_CASTED]], align 4 +// CHECK12-NEXT: [[TMP11:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 +// CHECK12-NEXT: [[TMP12:%.*]] = inttoptr i32 [[TMP7]] to i8* +// CHECK12-NEXT: store i8* [[TMP12]], i8** [[TMP11]], align 4 +// CHECK12-NEXT: [[TMP13:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 +// CHECK12-NEXT: [[TMP14:%.*]] = inttoptr i32 [[TMP8]] to i8* +// CHECK12-NEXT: store i8* [[TMP14]], i8** [[TMP13]], align 4 +// CHECK12-NEXT: [[TMP15:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 +// CHECK12-NEXT: [[TMP16:%.*]] = bitcast [10 x [10 x i32]]* [[TMP0]] to i8* +// CHECK12-NEXT: store i8* [[TMP16]], i8** [[TMP15]], align 4 +// CHECK12-NEXT: [[TMP17:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3 +// CHECK12-NEXT: [[TMP18:%.*]] = inttoptr i32 [[TMP10]] to i8* +// CHECK12-NEXT: store i8* [[TMP18]], i8** [[TMP17]], align 4 +// CHECK12-NEXT: [[TMP19:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK12-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, [10 x [10 x i32]]*, i32)* @__omp_outlined__7 to i8*), i8* null, i8** [[TMP19]], i32 4) +// CHECK12-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK12: 
omp.inner.for.inc: +// CHECK12-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK12-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK12-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP20]], [[TMP21]] +// CHECK12-NEXT: store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4 +// CHECK12-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK12-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK12-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP22]], [[TMP23]] +// CHECK12-NEXT: store i32 [[ADD3]], i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK12-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK12-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK12-NEXT: [[ADD4:%.*]] = add nsw i32 [[TMP24]], [[TMP25]] +// CHECK12-NEXT: store i32 [[ADD4]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK12-NEXT: [[TMP26:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK12-NEXT: [[CMP5:%.*]] = icmp sgt i32 [[TMP26]], 99 +// CHECK12-NEXT: br i1 [[CMP5]], label [[COND_TRUE6:%.*]], label [[COND_FALSE7:%.*]] +// CHECK12: cond.true6: +// CHECK12-NEXT: br label [[COND_END8:%.*]] +// CHECK12: cond.false7: +// CHECK12-NEXT: [[TMP27:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK12-NEXT: br label [[COND_END8]] +// CHECK12: cond.end8: +// CHECK12-NEXT: [[COND9:%.*]] = phi i32 [ 99, [[COND_TRUE6]] ], [ [[TMP27]], [[COND_FALSE7]] ] +// CHECK12-NEXT: store i32 [[COND9]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK12-NEXT: [[TMP28:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK12-NEXT: store i32 [[TMP28]], i32* [[DOTOMP_IV]], align 4 +// CHECK12-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK12: omp.inner.for.end: +// CHECK12-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK12: omp.loop.exit: +// CHECK12-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]]) +// CHECK12-NEXT: ret void +// +// +// CHECK12-LABEL: define {{[^@]+}}@__omp_outlined__7 +// CHECK12-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i32 [[F:%.*]]) #[[ATTR0]] { +// CHECK12-NEXT: entry: +// CHECK12-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK12-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK12-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4 +// CHECK12-NEXT: [[F_ADDR:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[_TMP1:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[K:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[J:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK12-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK12-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK12-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* 
[[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK12-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4 +// CHECK12-NEXT: store i32 [[F]], i32* [[F_ADDR]], align 4 +// CHECK12-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4 +// CHECK12-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 +// CHECK12-NEXT: store i32 99, i32* [[DOTOMP_UB]], align 4 +// CHECK12-NEXT: [[TMP1:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK12-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK12-NEXT: store i32 [[TMP1]], i32* [[DOTOMP_LB]], align 4 +// CHECK12-NEXT: store i32 [[TMP2]], i32* [[DOTOMP_UB]], align 4 +// CHECK12-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK12-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK12-NEXT: [[TMP3:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK12-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4 +// CHECK12-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP4]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) +// CHECK12-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK12-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 +// CHECK12-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK12: omp.inner.for.cond: +// CHECK12-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK12-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK12-NEXT: [[CMP:%.*]] = icmp ule i32 [[TMP6]], [[TMP7]] +// CHECK12-NEXT: br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK12: omp.inner.for.body: +// CHECK12-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK12-NEXT: [[DIV:%.*]] = sdiv i32 [[TMP8]], 10 +// CHECK12-NEXT: [[MUL:%.*]] = mul nsw i32 [[DIV]], 1 +// CHECK12-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK12-NEXT: store i32 [[ADD]], i32* [[I]], align 4 +// CHECK12-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK12-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK12-NEXT: [[DIV2:%.*]] = sdiv i32 [[TMP10]], 10 +// CHECK12-NEXT: [[MUL3:%.*]] = mul nsw i32 [[DIV2]], 10 +// CHECK12-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP9]], [[MUL3]] +// CHECK12-NEXT: [[MUL4:%.*]] = mul nsw i32 [[SUB]], 1 +// CHECK12-NEXT: [[ADD5:%.*]] = add nsw i32 0, [[MUL4]] +// CHECK12-NEXT: store i32 [[ADD5]], i32* [[J]], align 4 +// CHECK12-NEXT: store i32 10, i32* [[K]], align 4 +// CHECK12-NEXT: [[TMP11:%.*]] = load i32, i32* [[I]], align 4 +// CHECK12-NEXT: [[TMP12:%.*]] = load i32, i32* [[J]], align 4 +// CHECK12-NEXT: [[TMP13:%.*]] = load i32, i32* [[F_ADDR]], align 4 +// CHECK12-NEXT: [[MUL6:%.*]] = mul nsw i32 [[TMP12]], [[TMP13]] +// CHECK12-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP11]], [[MUL6]] +// CHECK12-NEXT: [[TMP14:%.*]] = load i32, i32* [[K]], align 4 +// CHECK12-NEXT: [[ADD8:%.*]] = add nsw i32 [[ADD7]], [[TMP14]] +// CHECK12-NEXT: [[TMP15:%.*]] = load i32, i32* [[I]], align 4 +// CHECK12-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* [[TMP0]], i32 0, i32 [[TMP15]] +// CHECK12-NEXT: [[TMP16:%.*]] = load i32, i32* [[J]], align 4 +// CHECK12-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[ARRAYIDX]], i32 0, i32 [[TMP16]] +// CHECK12-NEXT: store i32 [[ADD8]], i32* [[ARRAYIDX9]], align 4 +// CHECK12-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK12: 
omp.body.continue: +// CHECK12-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK12: omp.inner.for.inc: +// CHECK12-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK12-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK12-NEXT: [[ADD10:%.*]] = add nsw i32 [[TMP17]], [[TMP18]] +// CHECK12-NEXT: store i32 [[ADD10]], i32* [[DOTOMP_IV]], align 4 +// CHECK12-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK12: omp.inner.for.end: +// CHECK12-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK12: omp.loop.exit: +// CHECK12-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP4]]) +// CHECK12-NEXT: ret void +// +// +// CHECK12-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l67 +// CHECK12-SAME: (i32 [[N:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]]) #[[ATTR0]] { +// CHECK12-NEXT: entry: +// CHECK12-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4 +// CHECK12-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK12-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK12-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4 +// CHECK12-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4 +// CHECK12-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK12-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) +// CHECK12-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK12: .execute: +// CHECK12-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) +// CHECK12-NEXT: [[TMP2:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK12-NEXT: store i32 [[TMP2]], i32* [[N_CASTED]], align 4 +// CHECK12-NEXT: [[TMP3:%.*]] = load i32, i32* [[N_CASTED]], align 4 +// CHECK12-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK12-NEXT: call void @__omp_outlined__8(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32 [[TMP3]], [10 x [10 x i32]]* [[TMP0]]) #[[ATTR2]] +// CHECK12-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK12: .omp.deinit: +// CHECK12-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) +// CHECK12-NEXT: br label [[DOTEXIT:%.*]] +// CHECK12: .exit: +// CHECK12-NEXT: ret void +// +// +// CHECK12-LABEL: define {{[^@]+}}@__omp_outlined__8 +// CHECK12-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[N:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]]) #[[ATTR0]] { +// CHECK12-NEXT: entry: +// CHECK12-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK12-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK12-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4 +// CHECK12-NEXT: [[DOTOMP_IV:%.*]] = alloca i64, align 8 +// CHECK12-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[_TMP1:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[DOTCAPTURE_EXPR_3:%.*]] = alloca i64, align 8 +// CHECK12-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[J:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: 
[[DOTOMP_COMB_LB:%.*]] = alloca i64, align 8 +// CHECK12-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i64, align 8 +// CHECK12-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i64, align 8 +// CHECK12-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[I9:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[J10:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x i8*], align 4 +// CHECK12-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK12-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK12-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK12-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4 +// CHECK12-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4 +// CHECK12-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK12-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK12-NEXT: [[TMP2:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK12-NEXT: store i32 [[TMP2]], i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK12-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK12-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP3]], 0 +// CHECK12-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK12-NEXT: [[CONV:%.*]] = sext i32 [[DIV]] to i64 +// CHECK12-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK12-NEXT: [[SUB4:%.*]] = sub nsw i32 [[TMP4]], 0 +// CHECK12-NEXT: [[DIV5:%.*]] = sdiv i32 [[SUB4]], 1 +// CHECK12-NEXT: [[CONV6:%.*]] = sext i32 [[DIV5]] to i64 +// CHECK12-NEXT: [[MUL:%.*]] = mul nsw i64 [[CONV]], [[CONV6]] +// CHECK12-NEXT: [[SUB7:%.*]] = sub nsw i64 [[MUL]], 1 +// CHECK12-NEXT: store i64 [[SUB7]], i64* [[DOTCAPTURE_EXPR_3]], align 8 +// CHECK12-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK12-NEXT: store i32 0, i32* [[J]], align 4 +// CHECK12-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK12-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP5]] +// CHECK12-NEXT: br i1 [[CMP]], label [[LAND_LHS_TRUE:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK12: land.lhs.true: +// CHECK12-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK12-NEXT: [[CMP8:%.*]] = icmp slt i32 0, [[TMP6]] +// CHECK12-NEXT: br i1 [[CMP8]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END]] +// CHECK12: omp.precond.then: +// CHECK12-NEXT: store i64 0, i64* [[DOTOMP_COMB_LB]], align 8 +// CHECK12-NEXT: [[TMP7:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8 +// CHECK12-NEXT: store i64 [[TMP7]], i64* [[DOTOMP_COMB_UB]], align 8 +// CHECK12-NEXT: store i64 1, i64* [[DOTOMP_STRIDE]], align 8 +// CHECK12-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK12-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK12-NEXT: [[CONV11:%.*]] = zext i32 [[NVPTX_NUM_THREADS]] to i64 +// CHECK12-NEXT: [[TMP8:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK12-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP8]], align 4 +// CHECK12-NEXT: call void @__kmpc_for_static_init_8(%struct.ident_t* @[[GLOB1]], i32 [[TMP9]], i32 91, i32* [[DOTOMP_IS_LAST]], i64* [[DOTOMP_COMB_LB]], i64* [[DOTOMP_COMB_UB]], i64* [[DOTOMP_STRIDE]], i64 1, i64 [[CONV11]]) +// CHECK12-NEXT: [[TMP10:%.*]] = load i64, i64* [[DOTOMP_COMB_UB]], align 8 +// CHECK12-NEXT: [[TMP11:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8 +// CHECK12-NEXT: [[CMP12:%.*]] = icmp sgt 
i64 [[TMP10]], [[TMP11]] +// CHECK12-NEXT: br i1 [[CMP12]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK12: cond.true: +// CHECK12-NEXT: [[TMP12:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8 +// CHECK12-NEXT: br label [[COND_END:%.*]] +// CHECK12: cond.false: +// CHECK12-NEXT: [[TMP13:%.*]] = load i64, i64* [[DOTOMP_COMB_UB]], align 8 +// CHECK12-NEXT: br label [[COND_END]] +// CHECK12: cond.end: +// CHECK12-NEXT: [[COND:%.*]] = phi i64 [ [[TMP12]], [[COND_TRUE]] ], [ [[TMP13]], [[COND_FALSE]] ] +// CHECK12-NEXT: store i64 [[COND]], i64* [[DOTOMP_COMB_UB]], align 8 +// CHECK12-NEXT: [[TMP14:%.*]] = load i64, i64* [[DOTOMP_COMB_LB]], align 8 +// CHECK12-NEXT: store i64 [[TMP14]], i64* [[DOTOMP_IV]], align 8 +// CHECK12-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK12: omp.inner.for.cond: +// CHECK12-NEXT: [[TMP15:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8 +// CHECK12-NEXT: [[TMP16:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8 +// CHECK12-NEXT: [[ADD:%.*]] = add nsw i64 [[TMP16]], 1 +// CHECK12-NEXT: [[CMP13:%.*]] = icmp slt i64 [[TMP15]], [[ADD]] +// CHECK12-NEXT: br i1 [[CMP13]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK12: omp.inner.for.body: +// CHECK12-NEXT: [[TMP17:%.*]] = load i64, i64* [[DOTOMP_COMB_LB]], align 8 +// CHECK12-NEXT: [[TMP18:%.*]] = trunc i64 [[TMP17]] to i32 +// CHECK12-NEXT: [[TMP19:%.*]] = load i64, i64* [[DOTOMP_COMB_UB]], align 8 +// CHECK12-NEXT: [[TMP20:%.*]] = trunc i64 [[TMP19]] to i32 +// CHECK12-NEXT: [[TMP21:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK12-NEXT: store i32 [[TMP21]], i32* [[N_CASTED]], align 4 +// CHECK12-NEXT: [[TMP22:%.*]] = load i32, i32* [[N_CASTED]], align 4 +// CHECK12-NEXT: [[TMP23:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 +// CHECK12-NEXT: [[TMP24:%.*]] = inttoptr i32 [[TMP18]] to i8* +// CHECK12-NEXT: store i8* [[TMP24]], i8** [[TMP23]], align 4 +// CHECK12-NEXT: [[TMP25:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 +// CHECK12-NEXT: [[TMP26:%.*]] = inttoptr i32 [[TMP20]] to i8* +// CHECK12-NEXT: store i8* [[TMP26]], i8** [[TMP25]], align 4 +// CHECK12-NEXT: [[TMP27:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 +// CHECK12-NEXT: [[TMP28:%.*]] = inttoptr i32 [[TMP22]] to i8* +// CHECK12-NEXT: store i8* [[TMP28]], i8** [[TMP27]], align 4 +// CHECK12-NEXT: [[TMP29:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3 +// CHECK12-NEXT: [[TMP30:%.*]] = bitcast [10 x [10 x i32]]* [[TMP0]] to i8* +// CHECK12-NEXT: store i8* [[TMP30]], i8** [[TMP29]], align 4 +// CHECK12-NEXT: [[TMP31:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK12-NEXT: [[TMP32:%.*]] = load i32, i32* [[TMP31]], align 4 +// CHECK12-NEXT: [[TMP33:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK12-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP32]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, i32, [10 x [10 x i32]]*)* @__omp_outlined__9 to i8*), i8* null, i8** [[TMP33]], i32 4) +// CHECK12-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK12: omp.inner.for.inc: +// CHECK12-NEXT: [[TMP34:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8 +// CHECK12-NEXT: [[TMP35:%.*]] = load i64, i64* [[DOTOMP_STRIDE]], align 8 +// CHECK12-NEXT: [[ADD14:%.*]] = add nsw i64 [[TMP34]], [[TMP35]] +// CHECK12-NEXT: store i64 [[ADD14]], i64* [[DOTOMP_IV]], align 8 
+// CHECK12-NEXT: [[TMP36:%.*]] = load i64, i64* [[DOTOMP_COMB_LB]], align 8 +// CHECK12-NEXT: [[TMP37:%.*]] = load i64, i64* [[DOTOMP_STRIDE]], align 8 +// CHECK12-NEXT: [[ADD15:%.*]] = add nsw i64 [[TMP36]], [[TMP37]] +// CHECK12-NEXT: store i64 [[ADD15]], i64* [[DOTOMP_COMB_LB]], align 8 +// CHECK12-NEXT: [[TMP38:%.*]] = load i64, i64* [[DOTOMP_COMB_UB]], align 8 +// CHECK12-NEXT: [[TMP39:%.*]] = load i64, i64* [[DOTOMP_STRIDE]], align 8 +// CHECK12-NEXT: [[ADD16:%.*]] = add nsw i64 [[TMP38]], [[TMP39]] +// CHECK12-NEXT: store i64 [[ADD16]], i64* [[DOTOMP_COMB_UB]], align 8 +// CHECK12-NEXT: [[TMP40:%.*]] = load i64, i64* [[DOTOMP_COMB_UB]], align 8 +// CHECK12-NEXT: [[TMP41:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8 +// CHECK12-NEXT: [[CMP17:%.*]] = icmp sgt i64 [[TMP40]], [[TMP41]] +// CHECK12-NEXT: br i1 [[CMP17]], label [[COND_TRUE18:%.*]], label [[COND_FALSE19:%.*]] +// CHECK12: cond.true18: +// CHECK12-NEXT: [[TMP42:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8 +// CHECK12-NEXT: br label [[COND_END20:%.*]] +// CHECK12: cond.false19: +// CHECK12-NEXT: [[TMP43:%.*]] = load i64, i64* [[DOTOMP_COMB_UB]], align 8 +// CHECK12-NEXT: br label [[COND_END20]] +// CHECK12: cond.end20: +// CHECK12-NEXT: [[COND21:%.*]] = phi i64 [ [[TMP42]], [[COND_TRUE18]] ], [ [[TMP43]], [[COND_FALSE19]] ] +// CHECK12-NEXT: store i64 [[COND21]], i64* [[DOTOMP_COMB_UB]], align 8 +// CHECK12-NEXT: [[TMP44:%.*]] = load i64, i64* [[DOTOMP_COMB_LB]], align 8 +// CHECK12-NEXT: store i64 [[TMP44]], i64* [[DOTOMP_IV]], align 8 +// CHECK12-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK12: omp.inner.for.end: +// CHECK12-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK12: omp.loop.exit: +// CHECK12-NEXT: [[TMP45:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK12-NEXT: [[TMP46:%.*]] = load i32, i32* [[TMP45]], align 4 +// CHECK12-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP46]]) +// CHECK12-NEXT: br label [[OMP_PRECOND_END]] +// CHECK12: omp.precond.end: +// CHECK12-NEXT: ret void +// +// +// CHECK12-LABEL: define {{[^@]+}}@__omp_outlined__9 +// CHECK12-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32 [[N:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]]) #[[ATTR0]] { +// CHECK12-NEXT: entry: +// CHECK12-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK12-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK12-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4 +// CHECK12-NEXT: [[DOTOMP_IV:%.*]] = alloca i64, align 8 +// CHECK12-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[_TMP1:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[DOTCAPTURE_EXPR_3:%.*]] = alloca i64, align 8 +// CHECK12-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[J:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[DOTOMP_LB:%.*]] = alloca i64, align 8 +// CHECK12-NEXT: [[DOTOMP_UB:%.*]] = alloca i64, align 8 +// CHECK12-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i64, align 8 +// CHECK12-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[I11:%.*]] = alloca i32, align 4 +// 
CHECK12-NEXT: [[J12:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK12-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK12-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK12-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK12-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK12-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4 +// CHECK12-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4 +// CHECK12-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK12-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK12-NEXT: [[TMP2:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK12-NEXT: store i32 [[TMP2]], i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK12-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK12-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP3]], 0 +// CHECK12-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK12-NEXT: [[CONV:%.*]] = sext i32 [[DIV]] to i64 +// CHECK12-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK12-NEXT: [[SUB4:%.*]] = sub nsw i32 [[TMP4]], 0 +// CHECK12-NEXT: [[DIV5:%.*]] = sdiv i32 [[SUB4]], 1 +// CHECK12-NEXT: [[CONV6:%.*]] = sext i32 [[DIV5]] to i64 +// CHECK12-NEXT: [[MUL:%.*]] = mul nsw i64 [[CONV]], [[CONV6]] +// CHECK12-NEXT: [[SUB7:%.*]] = sub nsw i64 [[MUL]], 1 +// CHECK12-NEXT: store i64 [[SUB7]], i64* [[DOTCAPTURE_EXPR_3]], align 8 +// CHECK12-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK12-NEXT: store i32 0, i32* [[J]], align 4 +// CHECK12-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK12-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP5]] +// CHECK12-NEXT: br i1 [[CMP]], label [[LAND_LHS_TRUE:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK12: land.lhs.true: +// CHECK12-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK12-NEXT: [[CMP8:%.*]] = icmp slt i32 0, [[TMP6]] +// CHECK12-NEXT: br i1 [[CMP8]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END]] +// CHECK12: omp.precond.then: +// CHECK12-NEXT: store i64 0, i64* [[DOTOMP_LB]], align 8 +// CHECK12-NEXT: [[TMP7:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8 +// CHECK12-NEXT: store i64 [[TMP7]], i64* [[DOTOMP_UB]], align 8 +// CHECK12-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK12-NEXT: [[CONV9:%.*]] = zext i32 [[TMP8]] to i64 +// CHECK12-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK12-NEXT: [[CONV10:%.*]] = zext i32 [[TMP9]] to i64 +// CHECK12-NEXT: store i64 [[CONV9]], i64* [[DOTOMP_LB]], align 8 +// CHECK12-NEXT: store i64 [[CONV10]], i64* [[DOTOMP_UB]], align 8 +// CHECK12-NEXT: store i64 1, i64* [[DOTOMP_STRIDE]], align 8 +// CHECK12-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK12-NEXT: [[TMP10:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK12-NEXT: [[TMP11:%.*]] = load i32, i32* [[TMP10]], align 4 +// CHECK12-NEXT: call void @__kmpc_for_static_init_8(%struct.ident_t* @[[GLOB2]], i32 [[TMP11]], i32 33, i32* [[DOTOMP_IS_LAST]], i64* [[DOTOMP_LB]], i64* [[DOTOMP_UB]], i64* [[DOTOMP_STRIDE]], i64 1, i64 1) +// CHECK12-NEXT: [[TMP12:%.*]] = load i64, i64* [[DOTOMP_LB]], align 8 +// CHECK12-NEXT: store i64 [[TMP12]], i64* [[DOTOMP_IV]], align 8 +// CHECK12-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK12: 
omp.inner.for.cond:
+// CHECK12-NEXT: [[TMP13:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8
+// CHECK12-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK12-NEXT: [[CONV13:%.*]] = zext i32 [[TMP14]] to i64
+// CHECK12-NEXT: [[CMP14:%.*]] = icmp sle i64 [[TMP13]], [[CONV13]]
+// CHECK12-NEXT: br i1 [[CMP14]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK12: omp.inner.for.body:
+// CHECK12-NEXT: [[TMP15:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8
+// CHECK12-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK12-NEXT: [[SUB15:%.*]] = sub nsw i32 [[TMP16]], 0
+// CHECK12-NEXT: [[DIV16:%.*]] = sdiv i32 [[SUB15]], 1
+// CHECK12-NEXT: [[MUL17:%.*]] = mul nsw i32 1, [[DIV16]]
+// CHECK12-NEXT: [[CONV18:%.*]] = sext i32 [[MUL17]] to i64
+// CHECK12-NEXT: [[DIV19:%.*]] = sdiv i64 [[TMP15]], [[CONV18]]
+// CHECK12-NEXT: [[MUL20:%.*]] = mul nsw i64 [[DIV19]], 1
+// CHECK12-NEXT: [[ADD:%.*]] = add nsw i64 0, [[MUL20]]
+// CHECK12-NEXT: [[CONV21:%.*]] = trunc i64 [[ADD]] to i32
+// CHECK12-NEXT: store i32 [[CONV21]], i32* [[I11]], align 4
+// CHECK12-NEXT: [[TMP17:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8
+// CHECK12-NEXT: [[TMP18:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8
+// CHECK12-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK12-NEXT: [[SUB22:%.*]] = sub nsw i32 [[TMP19]], 0
+// CHECK12-NEXT: [[DIV23:%.*]] = sdiv i32 [[SUB22]], 1
+// CHECK12-NEXT: [[MUL24:%.*]] = mul nsw i32 1, [[DIV23]]
+// CHECK12-NEXT: [[CONV25:%.*]] = sext i32 [[MUL24]] to i64
+// CHECK12-NEXT: [[DIV26:%.*]] = sdiv i64 [[TMP18]], [[CONV25]]
+// CHECK12-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK12-NEXT: [[SUB27:%.*]] = sub nsw i32 [[TMP20]], 0
+// CHECK12-NEXT: [[DIV28:%.*]] = sdiv i32 [[SUB27]], 1
+// CHECK12-NEXT: [[MUL29:%.*]] = mul nsw i32 1, [[DIV28]]
+// CHECK12-NEXT: [[CONV30:%.*]] = sext i32 [[MUL29]] to i64
+// CHECK12-NEXT: [[MUL31:%.*]] = mul nsw i64 [[DIV26]], [[CONV30]]
+// CHECK12-NEXT: [[SUB32:%.*]] = sub nsw i64 [[TMP17]], [[MUL31]]
+// CHECK12-NEXT: [[MUL33:%.*]] = mul nsw i64 [[SUB32]], 1
+// CHECK12-NEXT: [[ADD34:%.*]] = add nsw i64 0, [[MUL33]]
+// CHECK12-NEXT: [[CONV35:%.*]] = trunc i64 [[ADD34]] to i32
+// CHECK12-NEXT: store i32 [[CONV35]], i32* [[J12]], align 4
+// CHECK12-NEXT: [[TMP21:%.*]] = load i32, i32* [[I11]], align 4
+// CHECK12-NEXT: [[TMP22:%.*]] = load i32, i32* [[J12]], align 4
+// CHECK12-NEXT: [[ADD36:%.*]] = add nsw i32 [[TMP21]], [[TMP22]]
+// CHECK12-NEXT: [[TMP23:%.*]] = load i32, i32* [[I11]], align 4
+// CHECK12-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* [[TMP0]], i32 0, i32 [[TMP23]]
+// CHECK12-NEXT: [[TMP24:%.*]] = load i32, i32* [[J12]], align 4
+// CHECK12-NEXT: [[ARRAYIDX37:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[ARRAYIDX]], i32 0, i32 [[TMP24]]
+// CHECK12-NEXT: store i32 [[ADD36]], i32* [[ARRAYIDX37]], align 4
+// CHECK12-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK12: omp.body.continue:
+// CHECK12-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK12: omp.inner.for.inc:
+// CHECK12-NEXT: [[TMP25:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8
+// CHECK12-NEXT: [[TMP26:%.*]] = load i64, i64* [[DOTOMP_STRIDE]], align 8
+// CHECK12-NEXT: [[ADD38:%.*]] = add nsw i64 [[TMP25]], [[TMP26]]
+// CHECK12-NEXT: store i64 [[ADD38]], i64* [[DOTOMP_IV]], align 8
+// CHECK12-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK12: omp.inner.for.end:
+// CHECK12-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK12: omp.loop.exit:
+// CHECK12-NEXT: [[TMP27:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK12-NEXT: [[TMP28:%.*]] = load i32, i32* [[TMP27]], align 4
+// CHECK12-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP28]])
+// CHECK12-NEXT: br label [[OMP_PRECOND_END]]
+// CHECK12: omp.precond.end:
+// CHECK12-NEXT: ret void
+//
+//
+// CHECK12-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l74
+// CHECK12-SAME: (i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32* [[V:%.*]]) #[[ATTR0]] {
+// CHECK12-NEXT: entry:
+// CHECK12-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4
+// CHECK12-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4
+// CHECK12-NEXT: [[V_ADDR:%.*]] = alloca i32*, align 4
+// CHECK12-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4
+// CHECK12-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK12-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK12-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK12-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4
+// CHECK12-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4
+// CHECK12-NEXT: store i32* [[V]], i32** [[V_ADDR]], align 4
+// CHECK12-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4
+// CHECK12-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK12-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0)
+// CHECK12-NEXT: br label [[DOTEXECUTE:%.*]]
+// CHECK12: .execute:
+// CHECK12-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]])
+// CHECK12-NEXT: [[TMP2:%.*]] = load i32, i32* [[N_ADDR]], align 4
+// CHECK12-NEXT: store i32 [[TMP2]], i32* [[N_CASTED]], align 4
+// CHECK12-NEXT: [[TMP3:%.*]] = load i32, i32* [[N_CASTED]], align 4
+// CHECK12-NEXT: [[TMP4:%.*]] = load i32*, i32** [[V_ADDR]], align 4
+// CHECK12-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK12-NEXT: call void @__omp_outlined__10(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32 [[TMP3]], [1000 x i32]* [[TMP0]], i32* [[TMP4]]) #[[ATTR2]]
+// CHECK12-NEXT: br label [[DOTOMP_DEINIT:%.*]]
+// CHECK12: .omp.deinit:
+// CHECK12-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0)
+// CHECK12-NEXT: br label [[DOTEXIT:%.*]]
+// CHECK12: .exit:
+// CHECK12-NEXT: ret void
+//
+//
+// CHECK12-LABEL: define {{[^@]+}}@__omp_outlined__10
+// CHECK12-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32* [[V:%.*]]) #[[ATTR0]] {
+// CHECK12-NEXT: entry:
+// CHECK12-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK12-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK12-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4
+// CHECK12-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4
+// CHECK12-NEXT: [[V_ADDR:%.*]] = alloca i32*, align 4
+// CHECK12-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK12-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK12-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK12-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
+// CHECK12-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK12-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK12-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK12-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK12-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK12-NEXT: [[I3:%.*]] = alloca i32, align 4
+// CHECK12-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4
+// CHECK12-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [5 x i8*], align 4
+// CHECK12-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK12-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK12-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4
+// CHECK12-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4
+// CHECK12-NEXT: store i32* [[V]], i32** [[V_ADDR]], align 4
+// CHECK12-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4
+// CHECK12-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4
+// CHECK12-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK12-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK12-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0
+// CHECK12-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK12-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1
+// CHECK12-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK12-NEXT: store i32 0, i32* [[I]], align 4
+// CHECK12-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK12-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]]
+// CHECK12-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]]
+// CHECK12: omp.precond.then:
+// CHECK12-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK12-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK12-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK12-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK12-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK12-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK12-NEXT: [[TMP5:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK12-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4
+// CHECK12-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP6]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]])
+// CHECK12-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK12-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK12-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP7]], [[TMP8]]
+// CHECK12-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK12: cond.true:
+// CHECK12-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK12-NEXT: br label [[COND_END:%.*]]
+// CHECK12: cond.false:
+// CHECK12-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK12-NEXT: br label [[COND_END]]
+// CHECK12: cond.end:
+// CHECK12-NEXT: [[COND:%.*]] = phi i32 [ [[TMP9]], [[COND_TRUE]] ], [ [[TMP10]], [[COND_FALSE]] ]
+// CHECK12-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK12-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK12-NEXT: store i32 [[TMP11]], i32* [[DOTOMP_IV]], align 4
+// CHECK12-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK12: omp.inner.for.cond:
+// CHECK12-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK12-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK12-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP13]], 1
+// CHECK12-NEXT: [[CMP5:%.*]] = icmp slt i32 [[TMP12]], [[ADD]]
+// CHECK12-NEXT: br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK12: omp.inner.for.body:
+// CHECK12-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK12-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK12-NEXT: [[TMP16:%.*]] = load i32, i32* [[N_ADDR]], align 4
+// CHECK12-NEXT: store i32 [[TMP16]], i32* [[N_CASTED]], align 4
+// CHECK12-NEXT: [[TMP17:%.*]] = load i32, i32* [[N_CASTED]], align 4
+// CHECK12-NEXT: [[TMP18:%.*]] = load i32*, i32** [[V_ADDR]], align 4
+// CHECK12-NEXT: [[TMP19:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0
+// CHECK12-NEXT: [[TMP20:%.*]] = inttoptr i32 [[TMP14]] to i8*
+// CHECK12-NEXT: store i8* [[TMP20]], i8** [[TMP19]], align 4
+// CHECK12-NEXT: [[TMP21:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1
+// CHECK12-NEXT: [[TMP22:%.*]] = inttoptr i32 [[TMP15]] to i8*
+// CHECK12-NEXT: store i8* [[TMP22]], i8** [[TMP21]], align 4
+// CHECK12-NEXT: [[TMP23:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2
+// CHECK12-NEXT: [[TMP24:%.*]] = inttoptr i32 [[TMP17]] to i8*
+// CHECK12-NEXT: store i8* [[TMP24]], i8** [[TMP23]], align 4
+// CHECK12-NEXT: [[TMP25:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3
+// CHECK12-NEXT: [[TMP26:%.*]] = bitcast [1000 x i32]* [[TMP0]] to i8*
+// CHECK12-NEXT: store i8* [[TMP26]], i8** [[TMP25]], align 4
+// CHECK12-NEXT: [[TMP27:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 4
+// CHECK12-NEXT: [[TMP28:%.*]] = bitcast i32* [[TMP18]] to i8*
+// CHECK12-NEXT: store i8* [[TMP28]], i8** [[TMP27]], align 4
+// CHECK12-NEXT: [[TMP29:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK12-NEXT: [[TMP30:%.*]] = load i32, i32* [[TMP29]], align 4
+// CHECK12-NEXT: [[TMP31:%.*]] = bitcast [5 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK12-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP30]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, i32, [1000 x i32]*, i32*)* @__omp_outlined__11 to i8*), i8* null, i8** [[TMP31]], i32 5)
+// CHECK12-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK12: omp.inner.for.inc:
+// CHECK12-NEXT: [[TMP32:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK12-NEXT: [[TMP33:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK12-NEXT: [[ADD6:%.*]] = add nsw i32 [[TMP32]], [[TMP33]]
+// CHECK12-NEXT: store i32 [[ADD6]], i32* [[DOTOMP_IV]], align 4
+// CHECK12-NEXT: [[TMP34:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK12-NEXT: [[TMP35:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK12-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP34]], [[TMP35]]
+// CHECK12-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK12-NEXT: [[TMP36:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK12-NEXT: [[TMP37:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK12-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP36]], [[TMP37]]
+// CHECK12-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK12-NEXT: [[TMP38:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK12-NEXT: [[TMP39:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK12-NEXT: [[CMP9:%.*]] = icmp sgt i32 [[TMP38]], [[TMP39]]
+// CHECK12-NEXT: br i1 [[CMP9]], label [[COND_TRUE10:%.*]], label [[COND_FALSE11:%.*]]
+// CHECK12: cond.true10:
+// CHECK12-NEXT: [[TMP40:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK12-NEXT: br label [[COND_END12:%.*]]
+// CHECK12: cond.false11:
+// CHECK12-NEXT: [[TMP41:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK12-NEXT: br label [[COND_END12]]
+// CHECK12: cond.end12:
+// CHECK12-NEXT: [[COND13:%.*]] = phi i32 [ [[TMP40]], [[COND_TRUE10]] ], [ [[TMP41]], [[COND_FALSE11]] ]
+// CHECK12-NEXT: store i32 [[COND13]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK12-NEXT: [[TMP42:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK12-NEXT: store i32 [[TMP42]], i32* [[DOTOMP_IV]], align 4
+// CHECK12-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK12: omp.inner.for.end:
+// CHECK12-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK12: omp.loop.exit:
+// CHECK12-NEXT: [[TMP43:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK12-NEXT: [[TMP44:%.*]] = load i32, i32* [[TMP43]], align 4
+// CHECK12-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP44]])
+// CHECK12-NEXT: br label [[OMP_PRECOND_END]]
+// CHECK12: omp.precond.end:
+// CHECK12-NEXT: ret void
+//
+//
+// CHECK12-LABEL: define {{[^@]+}}@__omp_outlined__11
+// CHECK12-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32* [[V:%.*]]) #[[ATTR0]] {
+// CHECK12-NEXT: entry:
+// CHECK12-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK12-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK12-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4
+// CHECK12-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4
+// CHECK12-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4
+// CHECK12-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4
+// CHECK12-NEXT: [[V_ADDR:%.*]] = alloca i32*, align 4
+// CHECK12-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK12-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK12-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK12-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
+// CHECK12-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK12-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK12-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK12-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK12-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK12-NEXT: [[I3:%.*]] = alloca i32, align 4
+// CHECK12-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK12-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK12-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK12-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK12-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4
+// CHECK12-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4
+// CHECK12-NEXT: store i32* [[V]], i32** [[V_ADDR]], align 4
+// CHECK12-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4
+// CHECK12-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4
+// CHECK12-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK12-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK12-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0
+// CHECK12-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK12-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1
+// CHECK12-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK12-NEXT: store i32 0, i32* [[I]], align 4
+// CHECK12-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK12-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]]
+// CHECK12-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]]
+// CHECK12: omp.precond.then:
+// CHECK12-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK12-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK12-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_UB]], align 4
+// CHECK12-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK12-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK12-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_LB]], align 4
+// CHECK12-NEXT: store i32 [[TMP6]], i32* [[DOTOMP_UB]], align 4
+// CHECK12-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK12-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK12-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK12-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4
+// CHECK12-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP8]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK12-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK12-NEXT: store i32 [[TMP9]], i32* [[DOTOMP_IV]], align 4
+// CHECK12-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK12: omp.inner.for.cond:
+// CHECK12-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK12-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK12-NEXT: [[CMP4:%.*]] = icmp ule i32 [[TMP10]], [[TMP11]]
+// CHECK12-NEXT: br i1 [[CMP4]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK12: omp.inner.for.body:
+// CHECK12-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK12-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP12]], 1
+// CHECK12-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK12-NEXT: store i32 [[ADD]], i32* [[I3]], align 4
+// CHECK12-NEXT: [[TMP13:%.*]] = load i32*, i32** [[V_ADDR]], align 4
+// CHECK12-NEXT: [[TMP14:%.*]] = load i32, i32* [[I3]], align 4
+// CHECK12-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TMP13]], i32 [[TMP14]]
+// CHECK12-NEXT: [[TMP15:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+// CHECK12-NEXT: [[TMP16:%.*]] = load i32, i32* [[I3]], align 4
+// CHECK12-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds [1000 x i32], [1000 x i32]* [[TMP0]], i32 0, i32 [[TMP16]]
+// CHECK12-NEXT: store i32 [[TMP15]], i32* [[ARRAYIDX5]], align 4
+// CHECK12-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK12: omp.body.continue:
+// CHECK12-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK12: omp.inner.for.inc:
+// CHECK12-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK12-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK12-NEXT: [[ADD6:%.*]] = add nsw i32 [[TMP17]], [[TMP18]]
+// CHECK12-NEXT: store i32 [[ADD6]], i32* [[DOTOMP_IV]], align 4
+// CHECK12-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK12: omp.inner.for.end:
+// CHECK12-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK12: omp.loop.exit:
+// CHECK12-NEXT: [[TMP19:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK12-NEXT: [[TMP20:%.*]] = load i32, i32* [[TMP19]], align 4
+// CHECK12-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP20]])
+// CHECK12-NEXT: br label [[OMP_PRECOND_END]]
+// CHECK12: omp.precond.end:
+// CHECK12-NEXT: ret void
+//
+//
+// CHECK13-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l43
+// CHECK13-SAME: (i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32 [[L:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK13-NEXT: entry:
+// CHECK13-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4
+// CHECK13-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4
+// CHECK13-NEXT: [[L_ADDR:%.*]] = alloca i32, align 4
+// CHECK13-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4
+// CHECK13-NEXT: [[L_CASTED:%.*]] = alloca i32, align 4
+// CHECK13-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK13-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK13-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK13-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4
+// CHECK13-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4
+// CHECK13-NEXT: store i32 [[L]], i32* [[L_ADDR]], align 4
+// CHECK13-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4
+// CHECK13-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK13-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0)
+// CHECK13-NEXT: br label [[DOTEXECUTE:%.*]]
+// CHECK13: .execute:
+// CHECK13-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3:[0-9]+]])
+// CHECK13-NEXT: [[TMP2:%.*]] = load i32, i32* [[N_ADDR]], align 4
+// CHECK13-NEXT: store i32 [[TMP2]], i32* [[N_CASTED]], align 4
+// CHECK13-NEXT: [[TMP3:%.*]] = load i32, i32* [[N_CASTED]], align 4
+// CHECK13-NEXT: [[TMP4:%.*]] = load i32, i32* [[L_ADDR]], align 4
+// CHECK13-NEXT: store i32 [[TMP4]], i32* [[L_CASTED]], align 4
+// CHECK13-NEXT: [[TMP5:%.*]] = load i32, i32* [[L_CASTED]], align 4
+// CHECK13-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK13-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32 [[TMP3]], [1000 x i32]* [[TMP0]], i32 [[TMP5]]) #[[ATTR2:[0-9]+]]
+// CHECK13-NEXT: br label [[DOTOMP_DEINIT:%.*]]
+// CHECK13: .omp.deinit:
+// CHECK13-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0)
+// CHECK13-NEXT: br label [[DOTEXIT:%.*]]
+// CHECK13: .exit:
+// CHECK13-NEXT: ret void
+//
+//
+// CHECK13-LABEL: define {{[^@]+}}@__omp_outlined__
+// CHECK13-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32 [[L:%.*]]) #[[ATTR0]] {
+// CHECK13-NEXT: entry:
+// CHECK13-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK13-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK13-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4
+// CHECK13-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4
+// CHECK13-NEXT: [[L_ADDR:%.*]] = alloca i32, align 4
+// CHECK13-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK13-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK13-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK13-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4
+// CHECK13-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK13-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK13-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK13-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK13-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK13-NEXT: [[I4:%.*]] = alloca i32, align 4
+// CHECK13-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4
+// CHECK13-NEXT: [[L_CASTED:%.*]] = alloca i32, align 4
+// CHECK13-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [5 x i8*], align 4
+// CHECK13-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK13-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK13-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4
+// CHECK13-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4
+// CHECK13-NEXT: store i32 [[L]], i32* [[L_ADDR]], align 4
+// CHECK13-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4
+// CHECK13-NEXT: [[TMP1:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared", align 2
+// CHECK13-NEXT: [[TMP2:%.*]] = load i32, i32* @"_openmp_static_kernel$size", align 4
+// CHECK13-NEXT: call void @__kmpc_get_team_static_memory(i16 1, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds (%"union._shared_openmp_static_memory_type_$_", %"union._shared_openmp_static_memory_type_$_" addrspace(3)* @"_openmp_shared_static_glob_rd_$_", i32 0, i32 0, i32 0) to i8*), i32 [[TMP2]], i16 [[TMP1]], i8** addrspacecast (i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr" to i8**))
+// CHECK13-NEXT: [[TMP3:%.*]] = load i8*, i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr", align 4
+// CHECK13-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, i8* [[TMP3]], i32 0
+// CHECK13-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to %struct._globalized_locals_ty*
+// CHECK13-NEXT: [[L1:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP5]], i32 0, i32 0
+// CHECK13-NEXT: [[TMP6:%.*]] = load i32, i32* [[N_ADDR]], align 4
+// CHECK13-NEXT: store i32 [[TMP6]], i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK13-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK13-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP7]], 0
+// CHECK13-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK13-NEXT: [[SUB3:%.*]] = sub nsw i32 [[DIV]], 1
+// CHECK13-NEXT: store i32 [[SUB3]], i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK13-NEXT: store i32 0, i32* [[I]], align 4
+// CHECK13-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK13-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP8]]
+// CHECK13-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]]
+// CHECK13: omp.precond.then:
+// CHECK13-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK13-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK13-NEXT: store i32 [[TMP9]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK13-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK13-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK13-NEXT: [[TMP10:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK13-NEXT: [[TMP11:%.*]] = load i32, i32* [[TMP10]], align 4
+// CHECK13-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1:[0-9]+]], i32 [[TMP11]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 128)
+// CHECK13-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK13-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK13-NEXT: [[CMP5:%.*]] = icmp sgt i32 [[TMP12]], [[TMP13]]
+// CHECK13-NEXT: br i1 [[CMP5]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK13: cond.true:
+// CHECK13-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK13-NEXT: br label [[COND_END:%.*]]
+// CHECK13: cond.false:
+// CHECK13-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK13-NEXT: br label [[COND_END]]
+// CHECK13: cond.end:
+// CHECK13-NEXT: [[COND:%.*]] = phi i32 [ [[TMP14]], [[COND_TRUE]] ], [ [[TMP15]], [[COND_FALSE]] ]
+// CHECK13-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK13-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK13-NEXT: store i32 [[TMP16]], i32* [[DOTOMP_IV]], align 4
+// CHECK13-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK13: omp.inner.for.cond:
+// CHECK13-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK13-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK13-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP18]], 1
+// CHECK13-NEXT: [[CMP6:%.*]] = icmp slt i32 [[TMP17]], [[ADD]]
+// CHECK13-NEXT: br i1 [[CMP6]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK13: omp.inner.for.body:
+// CHECK13-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK13-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK13-NEXT: [[TMP21:%.*]] = load i32, i32* [[N_ADDR]], align 4
+// CHECK13-NEXT: store i32 [[TMP21]], i32* [[N_CASTED]], align 4
+// CHECK13-NEXT: [[TMP22:%.*]] = load i32, i32* [[N_CASTED]], align 4
+// CHECK13-NEXT: [[TMP23:%.*]] = load i32, i32* [[L_ADDR]], align 4
+// CHECK13-NEXT: store i32 [[TMP23]], i32* [[L_CASTED]], align 4
+// CHECK13-NEXT: [[TMP24:%.*]] = load i32, i32* [[L_CASTED]], align 4
+// CHECK13-NEXT: [[TMP25:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0
+// CHECK13-NEXT: [[TMP26:%.*]] = inttoptr i32 [[TMP19]] to i8*
+// CHECK13-NEXT: store i8* [[TMP26]], i8** [[TMP25]], align 4
+// CHECK13-NEXT: [[TMP27:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1
+// CHECK13-NEXT: [[TMP28:%.*]] = inttoptr i32 [[TMP20]] to i8*
+// CHECK13-NEXT: store i8* [[TMP28]], i8** [[TMP27]], align 4
+// CHECK13-NEXT: [[TMP29:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2
+// CHECK13-NEXT: [[TMP30:%.*]] = inttoptr i32 [[TMP22]] to i8*
+// CHECK13-NEXT: store i8* [[TMP30]], i8** [[TMP29]], align 4
+// CHECK13-NEXT: [[TMP31:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3
+// CHECK13-NEXT: [[TMP32:%.*]] = bitcast [1000 x i32]* [[TMP0]] to i8*
+// CHECK13-NEXT: store i8* [[TMP32]], i8** [[TMP31]], align 4
+// CHECK13-NEXT: [[TMP33:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 4
+// CHECK13-NEXT: [[TMP34:%.*]] = inttoptr i32 [[TMP24]] to i8*
+// CHECK13-NEXT: store i8* [[TMP34]], i8** [[TMP33]], align 4
+// CHECK13-NEXT: [[TMP35:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK13-NEXT: [[TMP36:%.*]] = load i32, i32* [[TMP35]], align 4
+// CHECK13-NEXT: [[TMP37:%.*]] = bitcast [5 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK13-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP36]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, i32, [1000 x i32]*, i32)* @__omp_outlined__1 to i8*), i8* null, i8** [[TMP37]], i32 5)
+// CHECK13-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK13: omp.inner.for.inc:
+// CHECK13-NEXT: [[TMP38:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK13-NEXT: [[TMP39:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK13-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP38]], [[TMP39]]
+// CHECK13-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_IV]], align 4
+// CHECK13-NEXT: [[TMP40:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK13-NEXT: [[TMP41:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK13-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP40]], [[TMP41]]
+// CHECK13-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK13-NEXT: [[TMP42:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK13-NEXT: [[TMP43:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK13-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP42]], [[TMP43]]
+// CHECK13-NEXT: store i32 [[ADD9]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK13-NEXT: [[TMP44:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK13-NEXT: [[TMP45:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK13-NEXT: [[CMP10:%.*]] = icmp sgt i32 [[TMP44]], [[TMP45]]
+// CHECK13-NEXT: br i1 [[CMP10]], label [[COND_TRUE11:%.*]], label [[COND_FALSE12:%.*]]
+// CHECK13: cond.true11:
+// CHECK13-NEXT: [[TMP46:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK13-NEXT: br label [[COND_END13:%.*]]
+// CHECK13: cond.false12:
+// CHECK13-NEXT: [[TMP47:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK13-NEXT: br label [[COND_END13]]
+// CHECK13: cond.end13:
+// CHECK13-NEXT: [[COND14:%.*]] = phi i32 [ [[TMP46]], [[COND_TRUE11]] ], [ [[TMP47]], [[COND_FALSE12]] ]
+// CHECK13-NEXT: store i32 [[COND14]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK13-NEXT: [[TMP48:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK13-NEXT: store i32 [[TMP48]], i32* [[DOTOMP_IV]], align 4
+// CHECK13-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK13: omp.inner.for.end:
+// CHECK13-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK13: omp.loop.exit:
+// CHECK13-NEXT: [[TMP49:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK13-NEXT: [[TMP50:%.*]] = load i32, i32* [[TMP49]], align 4
+// CHECK13-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP50]])
+// CHECK13-NEXT: [[TMP51:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK13-NEXT: [[TMP52:%.*]] = icmp ne i32 [[TMP51]], 0
+// CHECK13-NEXT: br i1 [[TMP52]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]]
+// CHECK13: .omp.lastprivate.then:
+// CHECK13-NEXT: [[TMP53:%.*]] = load i32, i32* [[L_ADDR]], align 4
+// CHECK13-NEXT: store i32 [[TMP53]], i32* [[L_ADDR]], align 4
+// CHECK13-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]]
+// CHECK13: .omp.lastprivate.done:
+// CHECK13-NEXT: br label [[OMP_PRECOND_END]]
+// CHECK13: omp.precond.end:
+// CHECK13-NEXT: [[TMP54:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared", align 2
+// CHECK13-NEXT: call void @__kmpc_restore_team_static_memory(i16 1, i16 [[TMP54]])
+// CHECK13-NEXT: ret void
+//
+//
+// CHECK13-LABEL: define {{[^@]+}}@__omp_outlined__1
+// CHECK13-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32 [[L:%.*]]) #[[ATTR0]] {
+// CHECK13-NEXT: entry:
+// CHECK13-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK13-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK13-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4
+// CHECK13-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4
+// CHECK13-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4
+// CHECK13-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4
+// CHECK13-NEXT: [[L_ADDR:%.*]] = alloca i32, align 4
+// CHECK13-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK13-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK13-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK13-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
+// CHECK13-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK13-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK13-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK13-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK13-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK13-NEXT: [[I3:%.*]] = alloca i32, align 4
+// CHECK13-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK13-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK13-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK13-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK13-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4
+// CHECK13-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4
+// CHECK13-NEXT: store i32 [[L]], i32* [[L_ADDR]], align 4
+// CHECK13-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4
+// CHECK13-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4
+// CHECK13-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK13-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK13-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0
+// CHECK13-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK13-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1
+// CHECK13-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK13-NEXT: store i32 0, i32* [[I]], align 4
+// CHECK13-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK13-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]]
+// CHECK13-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]]
+// CHECK13: omp.precond.then:
+// CHECK13-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK13-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK13-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_UB]], align 4
+// CHECK13-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK13-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK13-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_LB]], align 4
+// CHECK13-NEXT: store i32 [[TMP6]], i32* [[DOTOMP_UB]], align 4
+// CHECK13-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK13-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK13-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK13-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4
+// CHECK13-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP8]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 32)
+// CHECK13-NEXT: br label [[OMP_DISPATCH_COND:%.*]]
+// CHECK13: omp.dispatch.cond:
+// CHECK13-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK13-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK13-NEXT: [[CMP4:%.*]] = icmp ugt i32 [[TMP9]], [[TMP10]]
+// CHECK13-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK13: cond.true:
+// CHECK13-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK13-NEXT: br label [[COND_END:%.*]]
+// CHECK13: cond.false:
+// CHECK13-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK13-NEXT: br label [[COND_END]]
+// CHECK13: cond.end:
+// CHECK13-NEXT: [[COND:%.*]] = phi i32 [ [[TMP11]], [[COND_TRUE]] ], [ [[TMP12]], [[COND_FALSE]] ]
+// CHECK13-NEXT: store i32 [[COND]], i32* [[DOTOMP_UB]], align 4
+// CHECK13-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK13-NEXT: store i32 [[TMP13]], i32* [[DOTOMP_IV]], align 4
+// CHECK13-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK13-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK13-NEXT: [[CMP5:%.*]] = icmp sle i32 [[TMP14]], [[TMP15]]
+// CHECK13-NEXT: br i1 [[CMP5]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]]
+// CHECK13: omp.dispatch.body:
+// CHECK13-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK13: omp.inner.for.cond:
+// CHECK13-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK13-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK13-NEXT: [[CMP6:%.*]] = icmp sle i32 [[TMP16]], [[TMP17]]
+// CHECK13-NEXT: br i1 [[CMP6]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK13: omp.inner.for.body:
+// CHECK13-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK13-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP18]], 1
+// CHECK13-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK13-NEXT: store i32 [[ADD]], i32* [[I3]], align 4
+// CHECK13-NEXT: [[TMP19:%.*]] = load i32, i32* [[I3]], align 4
+// CHECK13-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1000 x i32], [1000 x i32]* [[TMP0]], i32 0, i32 [[TMP19]]
+// CHECK13-NEXT: store i32 1, i32* [[ARRAYIDX]], align 4
+// CHECK13-NEXT: [[TMP20:%.*]] = load i32, i32* [[I3]], align 4
+// CHECK13-NEXT: store i32 [[TMP20]], i32* [[L_ADDR]], align 4
+// CHECK13-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK13: omp.body.continue:
+// CHECK13-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK13: omp.inner.for.inc:
+// CHECK13-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK13-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP21]], 1
+// CHECK13-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_IV]], align 4
+// CHECK13-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK13: omp.inner.for.end:
+// CHECK13-NEXT: br label [[OMP_DISPATCH_INC:%.*]]
+// CHECK13: omp.dispatch.inc:
+// CHECK13-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK13-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK13-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP22]], [[TMP23]]
+// CHECK13-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_LB]], align 4
+// CHECK13-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK13-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK13-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP24]], [[TMP25]]
+// CHECK13-NEXT: store i32 [[ADD9]], i32* [[DOTOMP_UB]], align 4
+// CHECK13-NEXT: br label [[OMP_DISPATCH_COND]]
+// CHECK13: omp.dispatch.end:
+// CHECK13-NEXT: [[TMP26:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK13-NEXT: [[TMP27:%.*]] = load i32, i32* [[TMP26]], align 4
+// CHECK13-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP27]])
+// CHECK13-NEXT: [[TMP28:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK13-NEXT: [[TMP29:%.*]] = icmp ne i32 [[TMP28]], 0
+// CHECK13-NEXT: br i1 [[TMP29]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]]
+// CHECK13: .omp.lastprivate.then:
+// CHECK13-NEXT: [[TMP30:%.*]] = load i32, i32* [[L_ADDR]], align 4
+// CHECK13-NEXT: store i32 [[TMP30]], i32* [[L_ADDR]], align 4
+// CHECK13-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]]
+// CHECK13: .omp.lastprivate.done:
+// CHECK13-NEXT: br label [[OMP_PRECOND_END]]
+// CHECK13: omp.precond.end:
+// CHECK13-NEXT: ret void
+//
+//
+// CHECK13-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l49
+// CHECK13-SAME: (i32 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] {
+// CHECK13-NEXT: entry:
+// CHECK13-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4
+// CHECK13-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 4
+// CHECK13-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4
+// CHECK13-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK13-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK13-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK13-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4
+// CHECK13-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 4
+// CHECK13-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 4
+// CHECK13-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK13-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0)
+// CHECK13-NEXT: br label [[DOTEXECUTE:%.*]]
+// CHECK13: .execute:
+// CHECK13-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]])
+// CHECK13-NEXT: [[TMP2:%.*]] = load i32, i32* [[N_ADDR]], align 4
+// CHECK13-NEXT: store i32 [[TMP2]], i32* [[N_CASTED]], align 4
+// CHECK13-NEXT: [[TMP3:%.*]] = load i32, i32* [[N_CASTED]], align 4
+// CHECK13-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK13-NEXT: call void @__omp_outlined__2(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32 [[TMP3]], [1000 x i16]* [[TMP0]]) #[[ATTR2]]
+// CHECK13-NEXT: br label [[DOTOMP_DEINIT:%.*]]
+// CHECK13: .omp.deinit:
+// CHECK13-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0)
+// CHECK13-NEXT: br label [[DOTEXIT:%.*]]
+// CHECK13: .exit:
+// CHECK13-NEXT: ret void
+//
+//
+// CHECK13-LABEL: define {{[^@]+}}@__omp_outlined__2
+// CHECK13-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] {
+// CHECK13-NEXT: entry:
+// CHECK13-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK13-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK13-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4
+// CHECK13-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 4
+// CHECK13-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK13-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK13-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK13-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
+// CHECK13-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK13-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK13-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK13-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK13-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK13-NEXT: [[I3:%.*]] = alloca i32, align 4
+// CHECK13-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4
+// CHECK13-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x i8*], align 4
+// CHECK13-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK13-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK13-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4
+// CHECK13-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 4
+// CHECK13-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 4
+// CHECK13-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4
+// CHECK13-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK13-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK13-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0
+// CHECK13-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK13-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1
+// CHECK13-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK13-NEXT: store i32 0, i32* [[I]], align 4
+// CHECK13-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK13-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]]
+// CHECK13-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]]
+// CHECK13: omp.precond.then:
+// CHECK13-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK13-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK13-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK13-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK13-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK13-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK13-NEXT: [[TMP5:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK13-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4
+// CHECK13-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP6]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]])
+// CHECK13-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK13-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK13-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP7]], [[TMP8]]
+// CHECK13-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK13: cond.true:
+// CHECK13-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK13-NEXT: br label [[COND_END:%.*]]
+// CHECK13: cond.false:
+// CHECK13-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK13-NEXT: br label [[COND_END]]
+// CHECK13: cond.end:
+// CHECK13-NEXT: [[COND:%.*]] = phi i32 [ [[TMP9]], [[COND_TRUE]] ], [ [[TMP10]], [[COND_FALSE]] ]
+// CHECK13-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK13-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK13-NEXT: store i32 [[TMP11]], i32* [[DOTOMP_IV]], align 4
+// CHECK13-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK13: omp.inner.for.cond:
+// CHECK13-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK13-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK13-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP13]], 1
+// CHECK13-NEXT: [[CMP5:%.*]] = icmp slt i32 [[TMP12]], [[ADD]]
+// CHECK13-NEXT: br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK13: omp.inner.for.body:
+// CHECK13-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK13-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK13-NEXT: [[TMP16:%.*]] = load i32, i32* [[N_ADDR]], align 4
+// CHECK13-NEXT: store i32 [[TMP16]], i32* [[N_CASTED]], align 4
+// CHECK13-NEXT: [[TMP17:%.*]] = load i32, i32* [[N_CASTED]], align 4
+// CHECK13-NEXT: [[TMP18:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0
+// CHECK13-NEXT: [[TMP19:%.*]] = inttoptr i32 [[TMP14]] to i8*
+// CHECK13-NEXT: store i8* [[TMP19]], i8** [[TMP18]], align 4
+// CHECK13-NEXT: [[TMP20:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1
+// CHECK13-NEXT: [[TMP21:%.*]] = inttoptr i32 [[TMP15]] to i8*
+// CHECK13-NEXT: store i8* [[TMP21]], i8** [[TMP20]], align 4
+// CHECK13-NEXT: [[TMP22:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2
+// CHECK13-NEXT: [[TMP23:%.*]] = inttoptr i32 [[TMP17]] to i8*
+// CHECK13-NEXT: store i8* [[TMP23]], i8** [[TMP22]], align 4
+// CHECK13-NEXT: [[TMP24:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3
+// CHECK13-NEXT: [[TMP25:%.*]] = bitcast [1000 x i16]* [[TMP0]] to i8*
+// CHECK13-NEXT: store i8* [[TMP25]], i8** [[TMP24]], align 4
+// CHECK13-NEXT: [[TMP26:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK13-NEXT: [[TMP27:%.*]] = load i32, i32* [[TMP26]], align 4
+// CHECK13-NEXT: [[TMP28:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK13-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP27]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, i32, [1000 x i16]*)* @__omp_outlined__3 to i8*), i8* null, i8** [[TMP28]], i32 4)
+// CHECK13-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK13: omp.inner.for.inc:
+// CHECK13-NEXT: [[TMP29:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK13-NEXT: [[TMP30:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK13-NEXT: [[ADD6:%.*]] = add nsw i32 [[TMP29]], [[TMP30]]
+// CHECK13-NEXT: store i32 [[ADD6]], i32* [[DOTOMP_IV]], align 4
+// CHECK13-NEXT: [[TMP31:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK13-NEXT: [[TMP32:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK13-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP31]], [[TMP32]]
+// CHECK13-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK13-NEXT: [[TMP33:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK13-NEXT: [[TMP34:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK13-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP33]], [[TMP34]]
+// CHECK13-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK13-NEXT: [[TMP35:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK13-NEXT: [[TMP36:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK13-NEXT: [[CMP9:%.*]] = icmp sgt i32 [[TMP35]], [[TMP36]]
+// CHECK13-NEXT: br i1 [[CMP9]], label [[COND_TRUE10:%.*]], label [[COND_FALSE11:%.*]]
+// CHECK13: cond.true10:
+// CHECK13-NEXT: [[TMP37:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK13-NEXT: br label [[COND_END12:%.*]]
+// CHECK13: cond.false11:
+// CHECK13-NEXT: [[TMP38:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK13-NEXT: br label [[COND_END12]]
+// CHECK13: cond.end12:
+// CHECK13-NEXT: [[COND13:%.*]] = phi i32 [ [[TMP37]], [[COND_TRUE10]] ], [ [[TMP38]], [[COND_FALSE11]] ]
+// CHECK13-NEXT: store i32 [[COND13]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK13-NEXT: [[TMP39:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK13-NEXT: store i32 [[TMP39]], i32* [[DOTOMP_IV]], align 4
+// CHECK13-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK13: omp.inner.for.end:
+// CHECK13-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK13: omp.loop.exit:
+// CHECK13-NEXT: [[TMP40:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK13-NEXT: [[TMP41:%.*]] = load i32, i32* [[TMP40]], align 4
+// CHECK13-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP41]])
+// CHECK13-NEXT: br label [[OMP_PRECOND_END]]
+// CHECK13: omp.precond.end:
+// CHECK13-NEXT: ret void
+//
+//
+// CHECK13-LABEL: define {{[^@]+}}@__omp_outlined__3
+// CHECK13-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] {
+// CHECK13-NEXT: entry:
+// CHECK13-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK13-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK13-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4
+// CHECK13-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4
+// CHECK13-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4
+// CHECK13-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 4
+// CHECK13-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK13-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK13-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK13-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
+// CHECK13-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK13-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK13-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK13-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK13-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK13-NEXT: [[I3:%.*]] = alloca i32, align 4
+// CHECK13-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK13-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK13-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK13-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK13-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4
+// CHECK13-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 4
+// CHECK13-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 4
+// CHECK13-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4
+// CHECK13-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK13-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK13-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0
+// CHECK13-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK13-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1
+// CHECK13-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK13-NEXT: store i32 0, i32* [[I]], align 4
+// CHECK13-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK13-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]]
+// CHECK13-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]]
+// CHECK13: omp.precond.then:
+// CHECK13-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK13-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK13-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_UB]], align 4
+// CHECK13-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK13-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK13-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_LB]], align 4
+// CHECK13-NEXT: store i32 [[TMP6]], i32* [[DOTOMP_UB]], align 4
+// CHECK13-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK13-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK13-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK13-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4
+// CHECK13-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP8]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK13-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK13-NEXT: store i32 [[TMP9]], i32* [[DOTOMP_IV]], align 4
+// CHECK13-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK13: omp.inner.for.cond:
+// CHECK13-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK13-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK13-NEXT: [[CMP4:%.*]] = icmp ule i32 [[TMP10]], [[TMP11]]
+// CHECK13-NEXT: br i1 [[CMP4]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK13: omp.inner.for.body:
+// CHECK13-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK13-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP12]], 1
+// CHECK13-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK13-NEXT: store i32 [[ADD]], i32* [[I3]], align 4
+// CHECK13-NEXT: [[TMP13:%.*]] = load i32, i32* [[I3]], align 4
+// CHECK13-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1000 x i16], [1000 x i16]* [[TMP0]], i32 0, i32 [[TMP13]]
+// CHECK13-NEXT: [[TMP14:%.*]] = load i16, i16* [[ARRAYIDX]], align 2
+// CHECK13-NEXT: [[CONV:%.*]] = sext i16 [[TMP14]] to i32
+// CHECK13-NEXT: [[ADD5:%.*]] = add nsw i32 [[CONV]], 1
+// CHECK13-NEXT: [[CONV6:%.*]] = trunc i32 [[ADD5]] to i16
+// CHECK13-NEXT: store i16 [[CONV6]], i16* [[ARRAYIDX]], align 2
+// CHECK13-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK13: omp.body.continue:
+// CHECK13-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK13: omp.inner.for.inc:
+// CHECK13-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK13-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK13-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP15]], [[TMP16]]
+// CHECK13-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_IV]], align 4
+// CHECK13-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK13: omp.inner.for.end:
+// CHECK13-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK13: omp.loop.exit:
+// CHECK13-NEXT: [[TMP17:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK13-NEXT: [[TMP18:%.*]] = load i32, i32* [[TMP17]], align 4
+// CHECK13-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP18]])
+// CHECK13-NEXT: br label [[OMP_PRECOND_END]]
+// CHECK13: omp.precond.end:
+// CHECK13-NEXT: ret void
+//
+//
+// CHECK13-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l54
+// CHECK13-SAME: ([10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] {
+// CHECK13-NEXT: entry:
+// CHECK13-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4
+// CHECK13-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK13-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK13-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK13-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4
+// CHECK13-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4
+// CHECK13-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK13-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0)
+// CHECK13-NEXT: br label [[DOTEXECUTE:%.*]]
+// CHECK13: .execute:
+// CHECK13-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]])
+// CHECK13-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK13-NEXT: call void @__omp_outlined__4(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x i32]* [[TMP0]]) #[[ATTR2]]
+// CHECK13-NEXT: br label [[DOTOMP_DEINIT:%.*]]
+// CHECK13: .omp.deinit:
+// CHECK13-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0)
+// CHECK13-NEXT: br label [[DOTEXIT:%.*]]
+// CHECK13: .exit:
+// CHECK13-NEXT: ret void
+//
+//
+// CHECK13-LABEL: define {{[^@]+}}@__omp_outlined__4
+// CHECK13-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] {
+// CHECK13-NEXT: entry:
+// CHECK13-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK13-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK13-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4
+// CHECK13-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK13-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK13-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK13-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK13-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK13-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK13-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK13-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [3 x i8*], align 4
+// CHECK13-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK13-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK13-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4
+// CHECK13-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4
+// CHECK13-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK13-NEXT: store i32 9, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK13-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK13-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK13-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK13-NEXT: [[TMP1:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK13-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4
+// CHECK13-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]])
+// CHECK13-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK13-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP3]], 9
+// CHECK13-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK13: cond.true:
+// CHECK13-NEXT: br label [[COND_END:%.*]]
+// CHECK13: cond.false:
+// CHECK13-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK13-NEXT: br label [[COND_END]]
+// CHECK13: cond.end:
+// CHECK13-NEXT: [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP4]], [[COND_FALSE]] ]
+// CHECK13-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK13-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK13-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4
+// CHECK13-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK13: omp.inner.for.cond:
+// CHECK13-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK13-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP6]], 10
+// CHECK13-NEXT: br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK13: omp.inner.for.body:
+// CHECK13-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK13-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK13-NEXT: [[TMP9:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0
+// CHECK13-NEXT: [[TMP10:%.*]] = inttoptr i32 [[TMP7]] to i8*
+// CHECK13-NEXT: store i8* [[TMP10]], i8** [[TMP9]], align 4
+// CHECK13-NEXT: [[TMP11:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1
+// CHECK13-NEXT: [[TMP12:%.*]] = inttoptr i32 [[TMP8]] to i8*
+// CHECK13-NEXT: store i8* [[TMP12]], i8** [[TMP11]], align 4
+// CHECK13-NEXT: [[TMP13:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2
+// CHECK13-NEXT: [[TMP14:%.*]] = bitcast [10 x i32]* [[TMP0]] to i8*
+// CHECK13-NEXT: store i8* [[TMP14]], i8** [[TMP13]], align 4
+// CHECK13-NEXT: [[TMP15:%.*]] = bitcast [3 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK13-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, [10 x i32]*)* @__omp_outlined__5 to i8*), i8* null, i8** [[TMP15]], i32 3)
+// CHECK13-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK13: omp.inner.for.inc:
+// CHECK13-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK13-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK13-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP16]], [[TMP17]]
+// CHECK13-NEXT: store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4
+// CHECK13-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK13-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK13-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP18]], [[TMP19]]
+// CHECK13-NEXT: store i32 [[ADD2]], i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK13-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK13-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK13-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP20]], [[TMP21]]
+// CHECK13-NEXT: store i32 [[ADD3]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK13-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK13-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP22]], 9
+// CHECK13-NEXT: br i1 [[CMP4]], label [[COND_TRUE5:%.*]], label [[COND_FALSE6:%.*]]
+// CHECK13: cond.true5:
+// CHECK13-NEXT: br label [[COND_END7:%.*]]
+// CHECK13: cond.false6:
+// CHECK13-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK13-NEXT: br label [[COND_END7]]
+// CHECK13: cond.end7:
+// CHECK13-NEXT: [[COND8:%.*]] = phi i32 [ 9, [[COND_TRUE5]] ], [ [[TMP23]], [[COND_FALSE6]] ]
+// CHECK13-NEXT: store i32 [[COND8]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK13-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK13-NEXT: store i32 [[TMP24]], i32* [[DOTOMP_IV]], align 4
+// CHECK13-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK13: omp.inner.for.end:
+// CHECK13-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK13: omp.loop.exit:
+// CHECK13-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]])
+// CHECK13-NEXT: ret void
+//
+//
+// 
CHECK13-LABEL: define {{[^@]+}}@__omp_outlined__5 +// CHECK13-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { +// CHECK13-NEXT: entry: +// CHECK13-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK13-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK13-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 +// CHECK13-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 +// CHECK13-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 +// CHECK13-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK13-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK13-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK13-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK13-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK13-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK13-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK13-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK13-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK13-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK13-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK13-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 +// CHECK13-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 +// CHECK13-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 +// CHECK13-NEXT: store i32 9, i32* [[DOTOMP_UB]], align 4 +// CHECK13-NEXT: [[TMP1:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK13-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK13-NEXT: store i32 [[TMP1]], i32* [[DOTOMP_LB]], align 4 +// CHECK13-NEXT: store i32 [[TMP2]], i32* [[DOTOMP_UB]], align 4 +// CHECK13-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK13-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK13-NEXT: [[TMP3:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK13-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4 +// CHECK13-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP4]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) +// CHECK13-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK13-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 +// CHECK13-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK13: omp.inner.for.cond: +// CHECK13-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK13-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK13-NEXT: [[CMP:%.*]] = icmp ule i32 [[TMP6]], [[TMP7]] +// CHECK13-NEXT: br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK13: omp.inner.for.body: +// CHECK13-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK13-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP8]], 1 +// CHECK13-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK13-NEXT: store i32 [[ADD]], i32* [[I]], align 4 +// CHECK13-NEXT: [[TMP9:%.*]] = load i32, i32* [[I]], align 4 +// CHECK13-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP0]], i32 0, i32 [[TMP9]] +// CHECK13-NEXT: [[TMP10:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +// CHECK13-NEXT: [[ADD1:%.*]] = add nsw i32 
[[TMP10]], 1 +// CHECK13-NEXT: store i32 [[ADD1]], i32* [[ARRAYIDX]], align 4 +// CHECK13-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK13: omp.body.continue: +// CHECK13-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK13: omp.inner.for.inc: +// CHECK13-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK13-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK13-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP11]], [[TMP12]] +// CHECK13-NEXT: store i32 [[ADD2]], i32* [[DOTOMP_IV]], align 4 +// CHECK13-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK13: omp.inner.for.end: +// CHECK13-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK13: omp.loop.exit: +// CHECK13-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP4]]) +// CHECK13-NEXT: ret void +// +// +// CHECK13-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l59 +// CHECK13-SAME: ([10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i32 [[F:%.*]]) #[[ATTR0]] { +// CHECK13-NEXT: entry: +// CHECK13-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4 +// CHECK13-NEXT: [[F_ADDR:%.*]] = alloca i32, align 4 +// CHECK13-NEXT: [[F_CASTED:%.*]] = alloca i32, align 4 +// CHECK13-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK13-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK13-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK13-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4 +// CHECK13-NEXT: store i32 [[F]], i32* [[F_ADDR]], align 4 +// CHECK13-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4 +// CHECK13-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK13-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) +// CHECK13-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK13: .execute: +// CHECK13-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) +// CHECK13-NEXT: [[TMP2:%.*]] = load i32, i32* [[F_ADDR]], align 4 +// CHECK13-NEXT: store i32 [[TMP2]], i32* [[F_CASTED]], align 4 +// CHECK13-NEXT: [[TMP3:%.*]] = load i32, i32* [[F_CASTED]], align 4 +// CHECK13-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK13-NEXT: call void @__omp_outlined__6(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x [10 x i32]]* [[TMP0]], i32 [[TMP3]]) #[[ATTR2]] +// CHECK13-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK13: .omp.deinit: +// CHECK13-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) +// CHECK13-NEXT: br label [[DOTEXIT:%.*]] +// CHECK13: .exit: +// CHECK13-NEXT: ret void +// +// +// CHECK13-LABEL: define {{[^@]+}}@__omp_outlined__6 +// CHECK13-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i32 [[F:%.*]]) #[[ATTR0]] { +// CHECK13-NEXT: entry: +// CHECK13-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK13-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK13-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4 +// CHECK13-NEXT: [[F_ADDR:%.*]] = alloca i32, align 4 +// CHECK13-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK13-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK13-NEXT: [[_TMP1:%.*]] = alloca i32, align 4 +// CHECK13-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 +// CHECK13-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 +// CHECK13-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca 
i32, align 4 +// CHECK13-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK13-NEXT: [[K:%.*]] = alloca i32, align 4 +// CHECK13-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK13-NEXT: [[J:%.*]] = alloca i32, align 4 +// CHECK13-NEXT: [[F_CASTED:%.*]] = alloca i32, align 4 +// CHECK13-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x i8*], align 4 +// CHECK13-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK13-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK13-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4 +// CHECK13-NEXT: store i32 [[F]], i32* [[F_ADDR]], align 4 +// CHECK13-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4 +// CHECK13-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK13-NEXT: store i32 99, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK13-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK13-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK13-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK13-NEXT: [[TMP1:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK13-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4 +// CHECK13-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) +// CHECK13-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK13-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP3]], 99 +// CHECK13-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK13: cond.true: +// CHECK13-NEXT: br label [[COND_END:%.*]] +// CHECK13: cond.false: +// CHECK13-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK13-NEXT: br label [[COND_END]] +// CHECK13: cond.end: +// CHECK13-NEXT: [[COND:%.*]] = phi i32 [ 99, [[COND_TRUE]] ], [ [[TMP4]], [[COND_FALSE]] ] +// CHECK13-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK13-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK13-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 +// CHECK13-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK13: omp.inner.for.cond: +// CHECK13-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK13-NEXT: [[CMP2:%.*]] = icmp slt i32 [[TMP6]], 100 +// CHECK13-NEXT: br i1 [[CMP2]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK13: omp.inner.for.body: +// CHECK13-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK13-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK13-NEXT: [[TMP9:%.*]] = load i32, i32* [[F_ADDR]], align 4 +// CHECK13-NEXT: store i32 [[TMP9]], i32* [[F_CASTED]], align 4 +// CHECK13-NEXT: [[TMP10:%.*]] = load i32, i32* [[F_CASTED]], align 4 +// CHECK13-NEXT: [[TMP11:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 +// CHECK13-NEXT: [[TMP12:%.*]] = inttoptr i32 [[TMP7]] to i8* +// CHECK13-NEXT: store i8* [[TMP12]], i8** [[TMP11]], align 4 +// CHECK13-NEXT: [[TMP13:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 +// CHECK13-NEXT: [[TMP14:%.*]] = inttoptr i32 [[TMP8]] to i8* +// CHECK13-NEXT: store i8* [[TMP14]], i8** [[TMP13]], align 4 +// CHECK13-NEXT: [[TMP15:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* 
[[CAPTURED_VARS_ADDRS]], i32 0, i32 2 +// CHECK13-NEXT: [[TMP16:%.*]] = bitcast [10 x [10 x i32]]* [[TMP0]] to i8* +// CHECK13-NEXT: store i8* [[TMP16]], i8** [[TMP15]], align 4 +// CHECK13-NEXT: [[TMP17:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3 +// CHECK13-NEXT: [[TMP18:%.*]] = inttoptr i32 [[TMP10]] to i8* +// CHECK13-NEXT: store i8* [[TMP18]], i8** [[TMP17]], align 4 +// CHECK13-NEXT: [[TMP19:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK13-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, [10 x [10 x i32]]*, i32)* @__omp_outlined__7 to i8*), i8* null, i8** [[TMP19]], i32 4) +// CHECK13-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK13: omp.inner.for.inc: +// CHECK13-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK13-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK13-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP20]], [[TMP21]] +// CHECK13-NEXT: store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4 +// CHECK13-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK13-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK13-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP22]], [[TMP23]] +// CHECK13-NEXT: store i32 [[ADD3]], i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK13-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK13-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK13-NEXT: [[ADD4:%.*]] = add nsw i32 [[TMP24]], [[TMP25]] +// CHECK13-NEXT: store i32 [[ADD4]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK13-NEXT: [[TMP26:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK13-NEXT: [[CMP5:%.*]] = icmp sgt i32 [[TMP26]], 99 +// CHECK13-NEXT: br i1 [[CMP5]], label [[COND_TRUE6:%.*]], label [[COND_FALSE7:%.*]] +// CHECK13: cond.true6: +// CHECK13-NEXT: br label [[COND_END8:%.*]] +// CHECK13: cond.false7: +// CHECK13-NEXT: [[TMP27:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK13-NEXT: br label [[COND_END8]] +// CHECK13: cond.end8: +// CHECK13-NEXT: [[COND9:%.*]] = phi i32 [ 99, [[COND_TRUE6]] ], [ [[TMP27]], [[COND_FALSE7]] ] +// CHECK13-NEXT: store i32 [[COND9]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK13-NEXT: [[TMP28:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK13-NEXT: store i32 [[TMP28]], i32* [[DOTOMP_IV]], align 4 +// CHECK13-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK13: omp.inner.for.end: +// CHECK13-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK13: omp.loop.exit: +// CHECK13-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]]) +// CHECK13-NEXT: ret void +// +// +// CHECK13-LABEL: define {{[^@]+}}@__omp_outlined__7 +// CHECK13-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i32 [[F:%.*]]) #[[ATTR0]] { +// CHECK13-NEXT: entry: +// CHECK13-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK13-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK13-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 +// CHECK13-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 +// CHECK13-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4 +// CHECK13-NEXT: [[F_ADDR:%.*]] = alloca i32, align 4 +// CHECK13-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK13-NEXT: 
[[TMP:%.*]] = alloca i32, align 4 +// CHECK13-NEXT: [[_TMP1:%.*]] = alloca i32, align 4 +// CHECK13-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK13-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK13-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK13-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK13-NEXT: [[K:%.*]] = alloca i32, align 4 +// CHECK13-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK13-NEXT: [[J:%.*]] = alloca i32, align 4 +// CHECK13-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK13-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK13-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK13-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK13-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4 +// CHECK13-NEXT: store i32 [[F]], i32* [[F_ADDR]], align 4 +// CHECK13-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4 +// CHECK13-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 +// CHECK13-NEXT: store i32 99, i32* [[DOTOMP_UB]], align 4 +// CHECK13-NEXT: [[TMP1:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK13-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK13-NEXT: store i32 [[TMP1]], i32* [[DOTOMP_LB]], align 4 +// CHECK13-NEXT: store i32 [[TMP2]], i32* [[DOTOMP_UB]], align 4 +// CHECK13-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK13-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK13-NEXT: [[TMP3:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK13-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4 +// CHECK13-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP4]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) +// CHECK13-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK13-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 +// CHECK13-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK13: omp.inner.for.cond: +// CHECK13-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK13-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK13-NEXT: [[CMP:%.*]] = icmp ule i32 [[TMP6]], [[TMP7]] +// CHECK13-NEXT: br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK13: omp.inner.for.body: +// CHECK13-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK13-NEXT: [[DIV:%.*]] = sdiv i32 [[TMP8]], 10 +// CHECK13-NEXT: [[MUL:%.*]] = mul nsw i32 [[DIV]], 1 +// CHECK13-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK13-NEXT: store i32 [[ADD]], i32* [[I]], align 4 +// CHECK13-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK13-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK13-NEXT: [[DIV2:%.*]] = sdiv i32 [[TMP10]], 10 +// CHECK13-NEXT: [[MUL3:%.*]] = mul nsw i32 [[DIV2]], 10 +// CHECK13-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP9]], [[MUL3]] +// CHECK13-NEXT: [[MUL4:%.*]] = mul nsw i32 [[SUB]], 1 +// CHECK13-NEXT: [[ADD5:%.*]] = add nsw i32 0, [[MUL4]] +// CHECK13-NEXT: store i32 [[ADD5]], i32* [[J]], align 4 +// CHECK13-NEXT: store i32 10, i32* [[K]], align 4 +// CHECK13-NEXT: [[TMP11:%.*]] = load i32, i32* [[I]], align 4 +// CHECK13-NEXT: [[TMP12:%.*]] = load i32, i32* [[J]], align 4 +// CHECK13-NEXT: [[TMP13:%.*]] = load 
i32, i32* [[F_ADDR]], align 4 +// CHECK13-NEXT: [[MUL6:%.*]] = mul nsw i32 [[TMP12]], [[TMP13]] +// CHECK13-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP11]], [[MUL6]] +// CHECK13-NEXT: [[TMP14:%.*]] = load i32, i32* [[K]], align 4 +// CHECK13-NEXT: [[ADD8:%.*]] = add nsw i32 [[ADD7]], [[TMP14]] +// CHECK13-NEXT: [[TMP15:%.*]] = load i32, i32* [[I]], align 4 +// CHECK13-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* [[TMP0]], i32 0, i32 [[TMP15]] +// CHECK13-NEXT: [[TMP16:%.*]] = load i32, i32* [[J]], align 4 +// CHECK13-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[ARRAYIDX]], i32 0, i32 [[TMP16]] +// CHECK13-NEXT: store i32 [[ADD8]], i32* [[ARRAYIDX9]], align 4 +// CHECK13-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK13: omp.body.continue: +// CHECK13-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK13: omp.inner.for.inc: +// CHECK13-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK13-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK13-NEXT: [[ADD10:%.*]] = add nsw i32 [[TMP17]], [[TMP18]] +// CHECK13-NEXT: store i32 [[ADD10]], i32* [[DOTOMP_IV]], align 4 +// CHECK13-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK13: omp.inner.for.end: +// CHECK13-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK13: omp.loop.exit: +// CHECK13-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP4]]) +// CHECK13-NEXT: ret void +// +// +// CHECK13-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l67 +// CHECK13-SAME: (i32 [[N:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]]) #[[ATTR0]] { +// CHECK13-NEXT: entry: +// CHECK13-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK13-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4 +// CHECK13-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 +// CHECK13-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK13-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK13-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK13-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK13-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4 +// CHECK13-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4 +// CHECK13-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK13-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) +// CHECK13-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK13: .execute: +// CHECK13-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) +// CHECK13-NEXT: [[TMP2:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK13-NEXT: store i32 [[TMP2]], i32* [[N_CASTED]], align 4 +// CHECK13-NEXT: [[TMP3:%.*]] = load i32, i32* [[N_CASTED]], align 4 +// CHECK13-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK13-NEXT: call void @__omp_outlined__8(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32 [[TMP3]], [10 x [10 x i32]]* [[TMP0]]) #[[ATTR2]] +// CHECK13-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK13: .omp.deinit: +// CHECK13-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) +// CHECK13-NEXT: br label [[DOTEXIT:%.*]] +// CHECK13: .exit: +// CHECK13-NEXT: ret void +// +// +// CHECK13-LABEL: define {{[^@]+}}@__omp_outlined__8 +// CHECK13-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[N:%.*]], [10 x [10 x i32]]* nonnull align 4 
dereferenceable(400) [[C:%.*]]) #[[ATTR0]] { +// CHECK13-NEXT: entry: +// CHECK13-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK13-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK13-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK13-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4 +// CHECK13-NEXT: [[DOTOMP_IV:%.*]] = alloca i64, align 8 +// CHECK13-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK13-NEXT: [[_TMP1:%.*]] = alloca i32, align 4 +// CHECK13-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK13-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4 +// CHECK13-NEXT: [[DOTCAPTURE_EXPR_3:%.*]] = alloca i64, align 8 +// CHECK13-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK13-NEXT: [[J:%.*]] = alloca i32, align 4 +// CHECK13-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i64, align 8 +// CHECK13-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i64, align 8 +// CHECK13-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i64, align 8 +// CHECK13-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK13-NEXT: [[I9:%.*]] = alloca i32, align 4 +// CHECK13-NEXT: [[J10:%.*]] = alloca i32, align 4 +// CHECK13-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 +// CHECK13-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x i8*], align 4 +// CHECK13-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK13-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK13-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK13-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4 +// CHECK13-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4 +// CHECK13-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK13-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK13-NEXT: [[TMP2:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK13-NEXT: store i32 [[TMP2]], i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK13-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK13-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP3]], 0 +// CHECK13-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK13-NEXT: [[CONV:%.*]] = sext i32 [[DIV]] to i64 +// CHECK13-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK13-NEXT: [[SUB4:%.*]] = sub nsw i32 [[TMP4]], 0 +// CHECK13-NEXT: [[DIV5:%.*]] = sdiv i32 [[SUB4]], 1 +// CHECK13-NEXT: [[CONV6:%.*]] = sext i32 [[DIV5]] to i64 +// CHECK13-NEXT: [[MUL:%.*]] = mul nsw i64 [[CONV]], [[CONV6]] +// CHECK13-NEXT: [[SUB7:%.*]] = sub nsw i64 [[MUL]], 1 +// CHECK13-NEXT: store i64 [[SUB7]], i64* [[DOTCAPTURE_EXPR_3]], align 8 +// CHECK13-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK13-NEXT: store i32 0, i32* [[J]], align 4 +// CHECK13-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK13-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP5]] +// CHECK13-NEXT: br i1 [[CMP]], label [[LAND_LHS_TRUE:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK13: land.lhs.true: +// CHECK13-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK13-NEXT: [[CMP8:%.*]] = icmp slt i32 0, [[TMP6]] +// CHECK13-NEXT: br i1 [[CMP8]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END]] +// CHECK13: omp.precond.then: +// CHECK13-NEXT: store i64 0, i64* [[DOTOMP_COMB_LB]], align 8 +// CHECK13-NEXT: [[TMP7:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8 +// CHECK13-NEXT: store i64 [[TMP7]], i64* [[DOTOMP_COMB_UB]], align 8 +// CHECK13-NEXT: store i64 1, i64* [[DOTOMP_STRIDE]], 
align 8 +// CHECK13-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK13-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK13-NEXT: [[CONV11:%.*]] = zext i32 [[NVPTX_NUM_THREADS]] to i64 +// CHECK13-NEXT: [[TMP8:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK13-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP8]], align 4 +// CHECK13-NEXT: call void @__kmpc_for_static_init_8(%struct.ident_t* @[[GLOB1]], i32 [[TMP9]], i32 91, i32* [[DOTOMP_IS_LAST]], i64* [[DOTOMP_COMB_LB]], i64* [[DOTOMP_COMB_UB]], i64* [[DOTOMP_STRIDE]], i64 1, i64 [[CONV11]]) +// CHECK13-NEXT: [[TMP10:%.*]] = load i64, i64* [[DOTOMP_COMB_UB]], align 8 +// CHECK13-NEXT: [[TMP11:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8 +// CHECK13-NEXT: [[CMP12:%.*]] = icmp sgt i64 [[TMP10]], [[TMP11]] +// CHECK13-NEXT: br i1 [[CMP12]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK13: cond.true: +// CHECK13-NEXT: [[TMP12:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8 +// CHECK13-NEXT: br label [[COND_END:%.*]] +// CHECK13: cond.false: +// CHECK13-NEXT: [[TMP13:%.*]] = load i64, i64* [[DOTOMP_COMB_UB]], align 8 +// CHECK13-NEXT: br label [[COND_END]] +// CHECK13: cond.end: +// CHECK13-NEXT: [[COND:%.*]] = phi i64 [ [[TMP12]], [[COND_TRUE]] ], [ [[TMP13]], [[COND_FALSE]] ] +// CHECK13-NEXT: store i64 [[COND]], i64* [[DOTOMP_COMB_UB]], align 8 +// CHECK13-NEXT: [[TMP14:%.*]] = load i64, i64* [[DOTOMP_COMB_LB]], align 8 +// CHECK13-NEXT: store i64 [[TMP14]], i64* [[DOTOMP_IV]], align 8 +// CHECK13-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK13: omp.inner.for.cond: +// CHECK13-NEXT: [[TMP15:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8 +// CHECK13-NEXT: [[TMP16:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8 +// CHECK13-NEXT: [[ADD:%.*]] = add nsw i64 [[TMP16]], 1 +// CHECK13-NEXT: [[CMP13:%.*]] = icmp slt i64 [[TMP15]], [[ADD]] +// CHECK13-NEXT: br i1 [[CMP13]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK13: omp.inner.for.body: +// CHECK13-NEXT: [[TMP17:%.*]] = load i64, i64* [[DOTOMP_COMB_LB]], align 8 +// CHECK13-NEXT: [[TMP18:%.*]] = trunc i64 [[TMP17]] to i32 +// CHECK13-NEXT: [[TMP19:%.*]] = load i64, i64* [[DOTOMP_COMB_UB]], align 8 +// CHECK13-NEXT: [[TMP20:%.*]] = trunc i64 [[TMP19]] to i32 +// CHECK13-NEXT: [[TMP21:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK13-NEXT: store i32 [[TMP21]], i32* [[N_CASTED]], align 4 +// CHECK13-NEXT: [[TMP22:%.*]] = load i32, i32* [[N_CASTED]], align 4 +// CHECK13-NEXT: [[TMP23:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 +// CHECK13-NEXT: [[TMP24:%.*]] = inttoptr i32 [[TMP18]] to i8* +// CHECK13-NEXT: store i8* [[TMP24]], i8** [[TMP23]], align 4 +// CHECK13-NEXT: [[TMP25:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 +// CHECK13-NEXT: [[TMP26:%.*]] = inttoptr i32 [[TMP20]] to i8* +// CHECK13-NEXT: store i8* [[TMP26]], i8** [[TMP25]], align 4 +// CHECK13-NEXT: [[TMP27:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 +// CHECK13-NEXT: [[TMP28:%.*]] = inttoptr i32 [[TMP22]] to i8* +// CHECK13-NEXT: store i8* [[TMP28]], i8** [[TMP27]], align 4 +// CHECK13-NEXT: [[TMP29:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3 +// CHECK13-NEXT: [[TMP30:%.*]] = bitcast [10 x [10 x i32]]* [[TMP0]] to i8* +// CHECK13-NEXT: store i8* [[TMP30]], i8** [[TMP29]], align 4 +// CHECK13-NEXT: [[TMP31:%.*]] = load 
i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK13-NEXT: [[TMP32:%.*]] = load i32, i32* [[TMP31]], align 4 +// CHECK13-NEXT: [[TMP33:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK13-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP32]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, i32, [10 x [10 x i32]]*)* @__omp_outlined__9 to i8*), i8* null, i8** [[TMP33]], i32 4) +// CHECK13-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK13: omp.inner.for.inc: +// CHECK13-NEXT: [[TMP34:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8 +// CHECK13-NEXT: [[TMP35:%.*]] = load i64, i64* [[DOTOMP_STRIDE]], align 8 +// CHECK13-NEXT: [[ADD14:%.*]] = add nsw i64 [[TMP34]], [[TMP35]] +// CHECK13-NEXT: store i64 [[ADD14]], i64* [[DOTOMP_IV]], align 8 +// CHECK13-NEXT: [[TMP36:%.*]] = load i64, i64* [[DOTOMP_COMB_LB]], align 8 +// CHECK13-NEXT: [[TMP37:%.*]] = load i64, i64* [[DOTOMP_STRIDE]], align 8 +// CHECK13-NEXT: [[ADD15:%.*]] = add nsw i64 [[TMP36]], [[TMP37]] +// CHECK13-NEXT: store i64 [[ADD15]], i64* [[DOTOMP_COMB_LB]], align 8 +// CHECK13-NEXT: [[TMP38:%.*]] = load i64, i64* [[DOTOMP_COMB_UB]], align 8 +// CHECK13-NEXT: [[TMP39:%.*]] = load i64, i64* [[DOTOMP_STRIDE]], align 8 +// CHECK13-NEXT: [[ADD16:%.*]] = add nsw i64 [[TMP38]], [[TMP39]] +// CHECK13-NEXT: store i64 [[ADD16]], i64* [[DOTOMP_COMB_UB]], align 8 +// CHECK13-NEXT: [[TMP40:%.*]] = load i64, i64* [[DOTOMP_COMB_UB]], align 8 +// CHECK13-NEXT: [[TMP41:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8 +// CHECK13-NEXT: [[CMP17:%.*]] = icmp sgt i64 [[TMP40]], [[TMP41]] +// CHECK13-NEXT: br i1 [[CMP17]], label [[COND_TRUE18:%.*]], label [[COND_FALSE19:%.*]] +// CHECK13: cond.true18: +// CHECK13-NEXT: [[TMP42:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8 +// CHECK13-NEXT: br label [[COND_END20:%.*]] +// CHECK13: cond.false19: +// CHECK13-NEXT: [[TMP43:%.*]] = load i64, i64* [[DOTOMP_COMB_UB]], align 8 +// CHECK13-NEXT: br label [[COND_END20]] +// CHECK13: cond.end20: +// CHECK13-NEXT: [[COND21:%.*]] = phi i64 [ [[TMP42]], [[COND_TRUE18]] ], [ [[TMP43]], [[COND_FALSE19]] ] +// CHECK13-NEXT: store i64 [[COND21]], i64* [[DOTOMP_COMB_UB]], align 8 +// CHECK13-NEXT: [[TMP44:%.*]] = load i64, i64* [[DOTOMP_COMB_LB]], align 8 +// CHECK13-NEXT: store i64 [[TMP44]], i64* [[DOTOMP_IV]], align 8 +// CHECK13-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK13: omp.inner.for.end: +// CHECK13-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK13: omp.loop.exit: +// CHECK13-NEXT: [[TMP45:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK13-NEXT: [[TMP46:%.*]] = load i32, i32* [[TMP45]], align 4 +// CHECK13-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP46]]) +// CHECK13-NEXT: br label [[OMP_PRECOND_END]] +// CHECK13: omp.precond.end: +// CHECK13-NEXT: ret void +// +// +// CHECK13-LABEL: define {{[^@]+}}@__omp_outlined__9 +// CHECK13-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32 [[N:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]]) #[[ATTR0]] { +// CHECK13-NEXT: entry: +// CHECK13-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK13-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK13-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 +// CHECK13-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 +// CHECK13-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK13-NEXT: 
[[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4 +// CHECK13-NEXT: [[DOTOMP_IV:%.*]] = alloca i64, align 8 +// CHECK13-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK13-NEXT: [[_TMP1:%.*]] = alloca i32, align 4 +// CHECK13-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK13-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4 +// CHECK13-NEXT: [[DOTCAPTURE_EXPR_3:%.*]] = alloca i64, align 8 +// CHECK13-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK13-NEXT: [[J:%.*]] = alloca i32, align 4 +// CHECK13-NEXT: [[DOTOMP_LB:%.*]] = alloca i64, align 8 +// CHECK13-NEXT: [[DOTOMP_UB:%.*]] = alloca i64, align 8 +// CHECK13-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i64, align 8 +// CHECK13-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK13-NEXT: [[I11:%.*]] = alloca i32, align 4 +// CHECK13-NEXT: [[J12:%.*]] = alloca i32, align 4 +// CHECK13-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK13-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK13-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK13-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK13-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK13-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4 +// CHECK13-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4 +// CHECK13-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK13-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK13-NEXT: [[TMP2:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK13-NEXT: store i32 [[TMP2]], i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK13-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK13-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP3]], 0 +// CHECK13-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK13-NEXT: [[CONV:%.*]] = sext i32 [[DIV]] to i64 +// CHECK13-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK13-NEXT: [[SUB4:%.*]] = sub nsw i32 [[TMP4]], 0 +// CHECK13-NEXT: [[DIV5:%.*]] = sdiv i32 [[SUB4]], 1 +// CHECK13-NEXT: [[CONV6:%.*]] = sext i32 [[DIV5]] to i64 +// CHECK13-NEXT: [[MUL:%.*]] = mul nsw i64 [[CONV]], [[CONV6]] +// CHECK13-NEXT: [[SUB7:%.*]] = sub nsw i64 [[MUL]], 1 +// CHECK13-NEXT: store i64 [[SUB7]], i64* [[DOTCAPTURE_EXPR_3]], align 8 +// CHECK13-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK13-NEXT: store i32 0, i32* [[J]], align 4 +// CHECK13-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK13-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP5]] +// CHECK13-NEXT: br i1 [[CMP]], label [[LAND_LHS_TRUE:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK13: land.lhs.true: +// CHECK13-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK13-NEXT: [[CMP8:%.*]] = icmp slt i32 0, [[TMP6]] +// CHECK13-NEXT: br i1 [[CMP8]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END]] +// CHECK13: omp.precond.then: +// CHECK13-NEXT: store i64 0, i64* [[DOTOMP_LB]], align 8 +// CHECK13-NEXT: [[TMP7:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8 +// CHECK13-NEXT: store i64 [[TMP7]], i64* [[DOTOMP_UB]], align 8 +// CHECK13-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK13-NEXT: [[CONV9:%.*]] = zext i32 [[TMP8]] to i64 +// CHECK13-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK13-NEXT: [[CONV10:%.*]] = zext i32 [[TMP9]] to i64 +// CHECK13-NEXT: store 
i64 [[CONV9]], i64* [[DOTOMP_LB]], align 8 +// CHECK13-NEXT: store i64 [[CONV10]], i64* [[DOTOMP_UB]], align 8 +// CHECK13-NEXT: store i64 1, i64* [[DOTOMP_STRIDE]], align 8 +// CHECK13-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK13-NEXT: [[TMP10:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK13-NEXT: [[TMP11:%.*]] = load i32, i32* [[TMP10]], align 4 +// CHECK13-NEXT: call void @__kmpc_for_static_init_8(%struct.ident_t* @[[GLOB2]], i32 [[TMP11]], i32 33, i32* [[DOTOMP_IS_LAST]], i64* [[DOTOMP_LB]], i64* [[DOTOMP_UB]], i64* [[DOTOMP_STRIDE]], i64 1, i64 1) +// CHECK13-NEXT: [[TMP12:%.*]] = load i64, i64* [[DOTOMP_LB]], align 8 +// CHECK13-NEXT: store i64 [[TMP12]], i64* [[DOTOMP_IV]], align 8 +// CHECK13-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK13: omp.inner.for.cond: +// CHECK13-NEXT: [[TMP13:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8 +// CHECK13-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK13-NEXT: [[CONV13:%.*]] = zext i32 [[TMP14]] to i64 +// CHECK13-NEXT: [[CMP14:%.*]] = icmp sle i64 [[TMP13]], [[CONV13]] +// CHECK13-NEXT: br i1 [[CMP14]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK13: omp.inner.for.body: +// CHECK13-NEXT: [[TMP15:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8 +// CHECK13-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK13-NEXT: [[SUB15:%.*]] = sub nsw i32 [[TMP16]], 0 +// CHECK13-NEXT: [[DIV16:%.*]] = sdiv i32 [[SUB15]], 1 +// CHECK13-NEXT: [[MUL17:%.*]] = mul nsw i32 1, [[DIV16]] +// CHECK13-NEXT: [[CONV18:%.*]] = sext i32 [[MUL17]] to i64 +// CHECK13-NEXT: [[DIV19:%.*]] = sdiv i64 [[TMP15]], [[CONV18]] +// CHECK13-NEXT: [[MUL20:%.*]] = mul nsw i64 [[DIV19]], 1 +// CHECK13-NEXT: [[ADD:%.*]] = add nsw i64 0, [[MUL20]] +// CHECK13-NEXT: [[CONV21:%.*]] = trunc i64 [[ADD]] to i32 +// CHECK13-NEXT: store i32 [[CONV21]], i32* [[I11]], align 4 +// CHECK13-NEXT: [[TMP17:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8 +// CHECK13-NEXT: [[TMP18:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8 +// CHECK13-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK13-NEXT: [[SUB22:%.*]] = sub nsw i32 [[TMP19]], 0 +// CHECK13-NEXT: [[DIV23:%.*]] = sdiv i32 [[SUB22]], 1 +// CHECK13-NEXT: [[MUL24:%.*]] = mul nsw i32 1, [[DIV23]] +// CHECK13-NEXT: [[CONV25:%.*]] = sext i32 [[MUL24]] to i64 +// CHECK13-NEXT: [[DIV26:%.*]] = sdiv i64 [[TMP18]], [[CONV25]] +// CHECK13-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK13-NEXT: [[SUB27:%.*]] = sub nsw i32 [[TMP20]], 0 +// CHECK13-NEXT: [[DIV28:%.*]] = sdiv i32 [[SUB27]], 1 +// CHECK13-NEXT: [[MUL29:%.*]] = mul nsw i32 1, [[DIV28]] +// CHECK13-NEXT: [[CONV30:%.*]] = sext i32 [[MUL29]] to i64 +// CHECK13-NEXT: [[MUL31:%.*]] = mul nsw i64 [[DIV26]], [[CONV30]] +// CHECK13-NEXT: [[SUB32:%.*]] = sub nsw i64 [[TMP17]], [[MUL31]] +// CHECK13-NEXT: [[MUL33:%.*]] = mul nsw i64 [[SUB32]], 1 +// CHECK13-NEXT: [[ADD34:%.*]] = add nsw i64 0, [[MUL33]] +// CHECK13-NEXT: [[CONV35:%.*]] = trunc i64 [[ADD34]] to i32 +// CHECK13-NEXT: store i32 [[CONV35]], i32* [[J12]], align 4 +// CHECK13-NEXT: [[TMP21:%.*]] = load i32, i32* [[I11]], align 4 +// CHECK13-NEXT: [[TMP22:%.*]] = load i32, i32* [[J12]], align 4 +// CHECK13-NEXT: [[ADD36:%.*]] = add nsw i32 [[TMP21]], [[TMP22]] +// CHECK13-NEXT: [[TMP23:%.*]] = load i32, i32* [[I11]], align 4 +// CHECK13-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* [[TMP0]], i32 0, i32 [[TMP23]] +// 
CHECK13-NEXT: [[TMP24:%.*]] = load i32, i32* [[J12]], align 4 +// CHECK13-NEXT: [[ARRAYIDX37:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[ARRAYIDX]], i32 0, i32 [[TMP24]] +// CHECK13-NEXT: store i32 [[ADD36]], i32* [[ARRAYIDX37]], align 4 +// CHECK13-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK13: omp.body.continue: +// CHECK13-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK13: omp.inner.for.inc: +// CHECK13-NEXT: [[TMP25:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8 +// CHECK13-NEXT: [[TMP26:%.*]] = load i64, i64* [[DOTOMP_STRIDE]], align 8 +// CHECK13-NEXT: [[ADD38:%.*]] = add nsw i64 [[TMP25]], [[TMP26]] +// CHECK13-NEXT: store i64 [[ADD38]], i64* [[DOTOMP_IV]], align 8 +// CHECK13-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK13: omp.inner.for.end: +// CHECK13-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK13: omp.loop.exit: +// CHECK13-NEXT: [[TMP27:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK13-NEXT: [[TMP28:%.*]] = load i32, i32* [[TMP27]], align 4 +// CHECK13-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP28]]) +// CHECK13-NEXT: br label [[OMP_PRECOND_END]] +// CHECK13: omp.precond.end: +// CHECK13-NEXT: ret void +// +// +// CHECK13-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l74 +// CHECK13-SAME: (i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32* [[V:%.*]]) #[[ATTR0]] { +// CHECK13-NEXT: entry: +// CHECK13-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK13-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4 +// CHECK13-NEXT: [[V_ADDR:%.*]] = alloca i32*, align 4 +// CHECK13-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 +// CHECK13-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK13-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK13-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK13-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK13-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4 +// CHECK13-NEXT: store i32* [[V]], i32** [[V_ADDR]], align 4 +// CHECK13-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4 +// CHECK13-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK13-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) +// CHECK13-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK13: .execute: +// CHECK13-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) +// CHECK13-NEXT: [[TMP2:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK13-NEXT: store i32 [[TMP2]], i32* [[N_CASTED]], align 4 +// CHECK13-NEXT: [[TMP3:%.*]] = load i32, i32* [[N_CASTED]], align 4 +// CHECK13-NEXT: [[TMP4:%.*]] = load i32*, i32** [[V_ADDR]], align 4 +// CHECK13-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK13-NEXT: call void @__omp_outlined__10(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32 [[TMP3]], [1000 x i32]* [[TMP0]], i32* [[TMP4]]) #[[ATTR2]] +// CHECK13-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK13: .omp.deinit: +// CHECK13-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) +// CHECK13-NEXT: br label [[DOTEXIT:%.*]] +// CHECK13: .exit: +// CHECK13-NEXT: ret void +// +// +// CHECK13-LABEL: define {{[^@]+}}@__omp_outlined__10 +// CHECK13-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32* [[V:%.*]]) #[[ATTR0]] { +// CHECK13-NEXT: entry: 
+// CHECK13-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK13-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK13-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4
+// CHECK13-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4
+// CHECK13-NEXT: [[V_ADDR:%.*]] = alloca i32*, align 4
+// CHECK13-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK13-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK13-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK13-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
+// CHECK13-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK13-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK13-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK13-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK13-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK13-NEXT: [[I3:%.*]] = alloca i32, align 4
+// CHECK13-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4
+// CHECK13-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [5 x i8*], align 4
+// CHECK13-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK13-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK13-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4
+// CHECK13-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4
+// CHECK13-NEXT: store i32* [[V]], i32** [[V_ADDR]], align 4
+// CHECK13-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4
+// CHECK13-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4
+// CHECK13-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK13-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK13-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0
+// CHECK13-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK13-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1
+// CHECK13-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK13-NEXT: store i32 0, i32* [[I]], align 4
+// CHECK13-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK13-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]]
+// CHECK13-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]]
+// CHECK13: omp.precond.then:
+// CHECK13-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK13-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK13-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK13-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK13-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK13-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK13-NEXT: [[TMP5:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK13-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4
+// CHECK13-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP6]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]])
+// CHECK13-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK13-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK13-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP7]], [[TMP8]]
+// CHECK13-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK13: cond.true:
+// CHECK13-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK13-NEXT: br label [[COND_END:%.*]]
+// CHECK13: cond.false:
+// CHECK13-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK13-NEXT: br label [[COND_END]]
+// CHECK13: cond.end:
+// CHECK13-NEXT: [[COND:%.*]] = phi i32 [ [[TMP9]], [[COND_TRUE]] ], [ [[TMP10]], [[COND_FALSE]] ]
+// CHECK13-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK13-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK13-NEXT: store i32 [[TMP11]], i32* [[DOTOMP_IV]], align 4
+// CHECK13-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK13: omp.inner.for.cond:
+// CHECK13-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK13-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK13-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP13]], 1
+// CHECK13-NEXT: [[CMP5:%.*]] = icmp slt i32 [[TMP12]], [[ADD]]
+// CHECK13-NEXT: br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK13: omp.inner.for.body:
+// CHECK13-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK13-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK13-NEXT: [[TMP16:%.*]] = load i32, i32* [[N_ADDR]], align 4
+// CHECK13-NEXT: store i32 [[TMP16]], i32* [[N_CASTED]], align 4
+// CHECK13-NEXT: [[TMP17:%.*]] = load i32, i32* [[N_CASTED]], align 4
+// CHECK13-NEXT: [[TMP18:%.*]] = load i32*, i32** [[V_ADDR]], align 4
+// CHECK13-NEXT: [[TMP19:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0
+// CHECK13-NEXT: [[TMP20:%.*]] = inttoptr i32 [[TMP14]] to i8*
+// CHECK13-NEXT: store i8* [[TMP20]], i8** [[TMP19]], align 4
+// CHECK13-NEXT: [[TMP21:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1
+// CHECK13-NEXT: [[TMP22:%.*]] = inttoptr i32 [[TMP15]] to i8*
+// CHECK13-NEXT: store i8* [[TMP22]], i8** [[TMP21]], align 4
+// CHECK13-NEXT: [[TMP23:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2
+// CHECK13-NEXT: [[TMP24:%.*]] = inttoptr i32 [[TMP17]] to i8*
+// CHECK13-NEXT: store i8* [[TMP24]], i8** [[TMP23]], align 4
+// CHECK13-NEXT: [[TMP25:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3
+// CHECK13-NEXT: [[TMP26:%.*]] = bitcast [1000 x i32]* [[TMP0]] to i8*
+// CHECK13-NEXT: store i8* [[TMP26]], i8** [[TMP25]], align 4
+// CHECK13-NEXT: [[TMP27:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 4
+// CHECK13-NEXT: [[TMP28:%.*]] = bitcast i32* [[TMP18]] to i8*
+// CHECK13-NEXT: store i8* [[TMP28]], i8** [[TMP27]], align 4
+// CHECK13-NEXT: [[TMP29:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK13-NEXT: [[TMP30:%.*]] = load i32, i32* [[TMP29]], align 4
+// CHECK13-NEXT: [[TMP31:%.*]] = bitcast [5 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK13-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP30]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, i32, [1000 x i32]*, i32*)* @__omp_outlined__11 to i8*), i8* null, i8** [[TMP31]], i32 5)
+// CHECK13-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK13: omp.inner.for.inc:
+// CHECK13-NEXT: [[TMP32:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK13-NEXT: [[TMP33:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK13-NEXT: [[ADD6:%.*]] = add nsw i32 [[TMP32]], [[TMP33]]
+// CHECK13-NEXT: store i32 [[ADD6]], i32* [[DOTOMP_IV]], align 4
+// CHECK13-NEXT: [[TMP34:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK13-NEXT: [[TMP35:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK13-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP34]], [[TMP35]]
+// CHECK13-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK13-NEXT: [[TMP36:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK13-NEXT: [[TMP37:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK13-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP36]], [[TMP37]]
+// CHECK13-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK13-NEXT: [[TMP38:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK13-NEXT: [[TMP39:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK13-NEXT: [[CMP9:%.*]] = icmp sgt i32 [[TMP38]], [[TMP39]]
+// CHECK13-NEXT: br i1 [[CMP9]], label [[COND_TRUE10:%.*]], label [[COND_FALSE11:%.*]]
+// CHECK13: cond.true10:
+// CHECK13-NEXT: [[TMP40:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK13-NEXT: br label [[COND_END12:%.*]]
+// CHECK13: cond.false11:
+// CHECK13-NEXT: [[TMP41:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK13-NEXT: br label [[COND_END12]]
+// CHECK13: cond.end12:
+// CHECK13-NEXT: [[COND13:%.*]] = phi i32 [ [[TMP40]], [[COND_TRUE10]] ], [ [[TMP41]], [[COND_FALSE11]] ]
+// CHECK13-NEXT: store i32 [[COND13]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK13-NEXT: [[TMP42:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK13-NEXT: store i32 [[TMP42]], i32* [[DOTOMP_IV]], align 4
+// CHECK13-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK13: omp.inner.for.end:
+// CHECK13-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK13: omp.loop.exit:
+// CHECK13-NEXT: [[TMP43:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK13-NEXT: [[TMP44:%.*]] = load i32, i32* [[TMP43]], align 4
+// CHECK13-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP44]])
+// CHECK13-NEXT: br label [[OMP_PRECOND_END]]
+// CHECK13: omp.precond.end:
+// CHECK13-NEXT: ret void
+//
+//
+// CHECK13-LABEL: define {{[^@]+}}@__omp_outlined__11
+// CHECK13-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32* [[V:%.*]]) #[[ATTR0]] {
+// CHECK13-NEXT: entry:
+// CHECK13-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK13-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK13-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4
+// CHECK13-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4
+// CHECK13-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4
+// CHECK13-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4
+// CHECK13-NEXT: [[V_ADDR:%.*]] = alloca i32*, align 4
+// CHECK13-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK13-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK13-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK13-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
+// CHECK13-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK13-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK13-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK13-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK13-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK13-NEXT: [[I3:%.*]] = alloca i32, align 4
+// CHECK13-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK13-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK13-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK13-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK13-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4
+// CHECK13-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4
+// CHECK13-NEXT: store i32* [[V]], i32** [[V_ADDR]], align 4
+// CHECK13-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4
+// CHECK13-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4
+// CHECK13-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK13-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK13-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0
+// CHECK13-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK13-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1
+// CHECK13-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK13-NEXT: store i32 0, i32* [[I]], align 4
+// CHECK13-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK13-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]]
+// CHECK13-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]]
+// CHECK13: omp.precond.then:
+// CHECK13-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK13-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK13-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_UB]], align 4
+// CHECK13-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK13-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK13-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_LB]], align 4
+// CHECK13-NEXT: store i32 [[TMP6]], i32* [[DOTOMP_UB]], align 4
+// CHECK13-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK13-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK13-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK13-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4
+// CHECK13-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP8]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK13-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK13-NEXT: store i32 [[TMP9]], i32* [[DOTOMP_IV]], align 4
+// CHECK13-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK13: omp.inner.for.cond:
+// CHECK13-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK13-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK13-NEXT: [[CMP4:%.*]] = icmp ule i32 [[TMP10]], [[TMP11]]
+// CHECK13-NEXT: br i1 [[CMP4]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK13: omp.inner.for.body:
+// CHECK13-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK13-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP12]], 1
+// CHECK13-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK13-NEXT: store i32 [[ADD]], i32* [[I3]], align 4
+// CHECK13-NEXT: [[TMP13:%.*]] = load i32*, i32** [[V_ADDR]], align 4
+// CHECK13-NEXT: [[TMP14:%.*]] = load i32, i32* [[I3]], align 4
+// CHECK13-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TMP13]], i32 [[TMP14]]
+// CHECK13-NEXT: [[TMP15:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+// CHECK13-NEXT: [[TMP16:%.*]] = load i32, i32* [[I3]], align 4
+// CHECK13-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds [1000 x i32], [1000 x i32]* [[TMP0]], i32 0, i32 [[TMP16]]
+// CHECK13-NEXT: store i32 [[TMP15]], i32* [[ARRAYIDX5]], align 4
+// CHECK13-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK13: omp.body.continue:
+// CHECK13-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK13: omp.inner.for.inc:
+// CHECK13-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK13-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK13-NEXT: [[ADD6:%.*]] = add nsw i32 [[TMP17]], [[TMP18]]
+// CHECK13-NEXT: store i32 [[ADD6]], i32* [[DOTOMP_IV]], align 4
+// CHECK13-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK13: omp.inner.for.end:
+// CHECK13-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK13: omp.loop.exit:
+// CHECK13-NEXT: [[TMP19:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK13-NEXT: [[TMP20:%.*]] = load i32, i32* [[TMP19]], align 4
+// CHECK13-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP20]])
+// CHECK13-NEXT: br label [[OMP_PRECOND_END]]
+// CHECK13: omp.precond.end:
+// CHECK13-NEXT: ret void
+//
+//
+// CHECK14-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l43
+// CHECK14-SAME: (i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32 [[L:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK14-NEXT: entry:
+// CHECK14-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4
+// CHECK14-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4
+// CHECK14-NEXT: [[L_ADDR:%.*]] = alloca i32, align 4
+// CHECK14-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4
+// CHECK14-NEXT: [[L_CASTED:%.*]] = alloca i32, align 4
+// CHECK14-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK14-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK14-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK14-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4
+// CHECK14-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4
+// CHECK14-NEXT: store i32 [[L]], i32* [[L_ADDR]], align 4
+// CHECK14-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4
+// CHECK14-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK14-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0)
+// CHECK14-NEXT: br label [[DOTEXECUTE:%.*]]
+// CHECK14: .execute:
+// CHECK14-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3:[0-9]+]])
+// CHECK14-NEXT: [[TMP2:%.*]] = load i32, i32* [[N_ADDR]], align 4
+// CHECK14-NEXT: store i32 [[TMP2]], i32* [[N_CASTED]], align 4
+// CHECK14-NEXT: [[TMP3:%.*]] = load i32, i32* [[N_CASTED]], align 4
+// CHECK14-NEXT: [[TMP4:%.*]] = load i32, i32* [[L_ADDR]], align 4
+// CHECK14-NEXT: store i32 [[TMP4]], i32* [[L_CASTED]], align 4
+// CHECK14-NEXT: [[TMP5:%.*]] = load i32, i32* [[L_CASTED]], align 4
+// CHECK14-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK14-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32 [[TMP3]], [1000 x i32]* [[TMP0]], i32 [[TMP5]]) #[[ATTR2:[0-9]+]]
+// CHECK14-NEXT: br label [[DOTOMP_DEINIT:%.*]]
+// CHECK14: .omp.deinit:
+// CHECK14-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0)
+// CHECK14-NEXT: br label [[DOTEXIT:%.*]]
+// CHECK14: .exit:
+// CHECK14-NEXT: ret void
+//
+//
+// CHECK14-LABEL: define {{[^@]+}}@__omp_outlined__
+// CHECK14-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32 [[L:%.*]]) #[[ATTR0]] {
+// CHECK14-NEXT: entry:
+// CHECK14-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK14-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK14-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4
+// CHECK14-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4
+// CHECK14-NEXT: [[L_ADDR:%.*]] = alloca i32, align 4
+// CHECK14-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK14-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK14-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK14-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4
+// CHECK14-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK14-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK14-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK14-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK14-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK14-NEXT: [[I4:%.*]] = alloca i32, align 4
+// CHECK14-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4
+// CHECK14-NEXT: [[L_CASTED:%.*]] = alloca i32, align 4
+// CHECK14-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [5 x i8*], align 4
+// CHECK14-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK14-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK14-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4
+// CHECK14-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4
+// CHECK14-NEXT: store i32 [[L]], i32* [[L_ADDR]], align 4
+// CHECK14-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4
+// CHECK14-NEXT: [[TMP1:%.*]] = call i8* @__kmpc_data_sharing_push_stack(i32 4, i16 1)
+// CHECK14-NEXT: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to %struct._globalized_locals_ty*
+// CHECK14-NEXT: [[L1:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP2]], i32 0, i32 0
+// CHECK14-NEXT: [[TMP3:%.*]] = load i32, i32* [[N_ADDR]], align 4
+// CHECK14-NEXT: store i32 [[TMP3]], i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK14-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK14-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP4]], 0
+// CHECK14-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK14-NEXT: [[SUB3:%.*]] = sub nsw i32 [[DIV]], 1
+// CHECK14-NEXT: store i32 [[SUB3]], i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK14-NEXT: store i32 0, i32* [[I]], align 4
+// CHECK14-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK14-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP5]]
+// CHECK14-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]]
+// CHECK14: omp.precond.then:
+// CHECK14-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK14-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK14-NEXT: store i32 [[TMP6]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK14-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK14-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK14-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK14-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4
+// CHECK14-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1:[0-9]+]], i32 [[TMP8]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 128)
+// CHECK14-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK14-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK14-NEXT: [[CMP5:%.*]] = icmp sgt i32
[[TMP9]], [[TMP10]] +// CHECK14-NEXT: br i1 [[CMP5]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK14: cond.true: +// CHECK14-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK14-NEXT: br label [[COND_END:%.*]] +// CHECK14: cond.false: +// CHECK14-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK14-NEXT: br label [[COND_END]] +// CHECK14: cond.end: +// CHECK14-NEXT: [[COND:%.*]] = phi i32 [ [[TMP11]], [[COND_TRUE]] ], [ [[TMP12]], [[COND_FALSE]] ] +// CHECK14-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK14-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK14-NEXT: store i32 [[TMP13]], i32* [[DOTOMP_IV]], align 4 +// CHECK14-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK14: omp.inner.for.cond: +// CHECK14-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK14-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK14-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP15]], 1 +// CHECK14-NEXT: [[CMP6:%.*]] = icmp slt i32 [[TMP14]], [[ADD]] +// CHECK14-NEXT: br i1 [[CMP6]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK14: omp.inner.for.body: +// CHECK14-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK14-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK14-NEXT: [[TMP18:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK14-NEXT: store i32 [[TMP18]], i32* [[N_CASTED]], align 4 +// CHECK14-NEXT: [[TMP19:%.*]] = load i32, i32* [[N_CASTED]], align 4 +// CHECK14-NEXT: [[TMP20:%.*]] = load i32, i32* [[L_ADDR]], align 4 +// CHECK14-NEXT: store i32 [[TMP20]], i32* [[L_CASTED]], align 4 +// CHECK14-NEXT: [[TMP21:%.*]] = load i32, i32* [[L_CASTED]], align 4 +// CHECK14-NEXT: [[TMP22:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 +// CHECK14-NEXT: [[TMP23:%.*]] = inttoptr i32 [[TMP16]] to i8* +// CHECK14-NEXT: store i8* [[TMP23]], i8** [[TMP22]], align 4 +// CHECK14-NEXT: [[TMP24:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 +// CHECK14-NEXT: [[TMP25:%.*]] = inttoptr i32 [[TMP17]] to i8* +// CHECK14-NEXT: store i8* [[TMP25]], i8** [[TMP24]], align 4 +// CHECK14-NEXT: [[TMP26:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 +// CHECK14-NEXT: [[TMP27:%.*]] = inttoptr i32 [[TMP19]] to i8* +// CHECK14-NEXT: store i8* [[TMP27]], i8** [[TMP26]], align 4 +// CHECK14-NEXT: [[TMP28:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3 +// CHECK14-NEXT: [[TMP29:%.*]] = bitcast [1000 x i32]* [[TMP0]] to i8* +// CHECK14-NEXT: store i8* [[TMP29]], i8** [[TMP28]], align 4 +// CHECK14-NEXT: [[TMP30:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 4 +// CHECK14-NEXT: [[TMP31:%.*]] = inttoptr i32 [[TMP21]] to i8* +// CHECK14-NEXT: store i8* [[TMP31]], i8** [[TMP30]], align 4 +// CHECK14-NEXT: [[TMP32:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK14-NEXT: [[TMP33:%.*]] = load i32, i32* [[TMP32]], align 4 +// CHECK14-NEXT: [[TMP34:%.*]] = bitcast [5 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK14-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP33]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, i32, [1000 x i32]*, i32)* @__omp_outlined__1 to i8*), i8* null, i8** [[TMP34]], i32 5) +// CHECK14-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] 
+// CHECK14: omp.inner.for.inc: +// CHECK14-NEXT: [[TMP35:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK14-NEXT: [[TMP36:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK14-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP35]], [[TMP36]] +// CHECK14-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_IV]], align 4 +// CHECK14-NEXT: [[TMP37:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK14-NEXT: [[TMP38:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK14-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP37]], [[TMP38]] +// CHECK14-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK14-NEXT: [[TMP39:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK14-NEXT: [[TMP40:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK14-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP39]], [[TMP40]] +// CHECK14-NEXT: store i32 [[ADD9]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK14-NEXT: [[TMP41:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK14-NEXT: [[TMP42:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK14-NEXT: [[CMP10:%.*]] = icmp sgt i32 [[TMP41]], [[TMP42]] +// CHECK14-NEXT: br i1 [[CMP10]], label [[COND_TRUE11:%.*]], label [[COND_FALSE12:%.*]] +// CHECK14: cond.true11: +// CHECK14-NEXT: [[TMP43:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK14-NEXT: br label [[COND_END13:%.*]] +// CHECK14: cond.false12: +// CHECK14-NEXT: [[TMP44:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK14-NEXT: br label [[COND_END13]] +// CHECK14: cond.end13: +// CHECK14-NEXT: [[COND14:%.*]] = phi i32 [ [[TMP43]], [[COND_TRUE11]] ], [ [[TMP44]], [[COND_FALSE12]] ] +// CHECK14-NEXT: store i32 [[COND14]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK14-NEXT: [[TMP45:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK14-NEXT: store i32 [[TMP45]], i32* [[DOTOMP_IV]], align 4 +// CHECK14-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK14: omp.inner.for.end: +// CHECK14-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK14: omp.loop.exit: +// CHECK14-NEXT: [[TMP46:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK14-NEXT: [[TMP47:%.*]] = load i32, i32* [[TMP46]], align 4 +// CHECK14-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP47]]) +// CHECK14-NEXT: [[TMP48:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK14-NEXT: [[TMP49:%.*]] = icmp ne i32 [[TMP48]], 0 +// CHECK14-NEXT: br i1 [[TMP49]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]] +// CHECK14: .omp.lastprivate.then: +// CHECK14-NEXT: [[TMP50:%.*]] = load i32, i32* [[L_ADDR]], align 4 +// CHECK14-NEXT: store i32 [[TMP50]], i32* [[L_ADDR]], align 4 +// CHECK14-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]] +// CHECK14: .omp.lastprivate.done: +// CHECK14-NEXT: br label [[OMP_PRECOND_END]] +// CHECK14: omp.precond.end: +// CHECK14-NEXT: call void @__kmpc_data_sharing_pop_stack(i8* [[TMP1]]) +// CHECK14-NEXT: ret void +// +// +// CHECK14-LABEL: define {{[^@]+}}@__omp_outlined__1 +// CHECK14-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32 [[L:%.*]]) #[[ATTR0]] { +// CHECK14-NEXT: entry: +// CHECK14-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK14-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK14-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, 
align 4 +// CHECK14-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4 +// CHECK14-NEXT: [[L_ADDR:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: [[I3:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK14-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK14-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK14-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK14-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK14-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4 +// CHECK14-NEXT: store i32 [[L]], i32* [[L_ADDR]], align 4 +// CHECK14-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4 +// CHECK14-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK14-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK14-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK14-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 +// CHECK14-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK14-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK14-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK14-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK14-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK14-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] +// CHECK14-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK14: omp.precond.then: +// CHECK14-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 +// CHECK14-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK14-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_UB]], align 4 +// CHECK14-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK14-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK14-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_LB]], align 4 +// CHECK14-NEXT: store i32 [[TMP6]], i32* [[DOTOMP_UB]], align 4 +// CHECK14-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK14-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK14-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK14-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4 +// CHECK14-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP8]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 32) +// CHECK14-NEXT: br label [[OMP_DISPATCH_COND:%.*]] +// CHECK14: omp.dispatch.cond: +// CHECK14-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK14-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK14-NEXT: [[CMP4:%.*]] = icmp ugt i32 [[TMP9]], [[TMP10]] +// CHECK14-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] 
+// CHECK14: cond.true: +// CHECK14-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK14-NEXT: br label [[COND_END:%.*]] +// CHECK14: cond.false: +// CHECK14-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK14-NEXT: br label [[COND_END]] +// CHECK14: cond.end: +// CHECK14-NEXT: [[COND:%.*]] = phi i32 [ [[TMP11]], [[COND_TRUE]] ], [ [[TMP12]], [[COND_FALSE]] ] +// CHECK14-NEXT: store i32 [[COND]], i32* [[DOTOMP_UB]], align 4 +// CHECK14-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK14-NEXT: store i32 [[TMP13]], i32* [[DOTOMP_IV]], align 4 +// CHECK14-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK14-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK14-NEXT: [[CMP5:%.*]] = icmp sle i32 [[TMP14]], [[TMP15]] +// CHECK14-NEXT: br i1 [[CMP5]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]] +// CHECK14: omp.dispatch.body: +// CHECK14-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK14: omp.inner.for.cond: +// CHECK14-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK14-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK14-NEXT: [[CMP6:%.*]] = icmp sle i32 [[TMP16]], [[TMP17]] +// CHECK14-NEXT: br i1 [[CMP6]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK14: omp.inner.for.body: +// CHECK14-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK14-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP18]], 1 +// CHECK14-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK14-NEXT: store i32 [[ADD]], i32* [[I3]], align 4 +// CHECK14-NEXT: [[TMP19:%.*]] = load i32, i32* [[I3]], align 4 +// CHECK14-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1000 x i32], [1000 x i32]* [[TMP0]], i32 0, i32 [[TMP19]] +// CHECK14-NEXT: store i32 1, i32* [[ARRAYIDX]], align 4 +// CHECK14-NEXT: [[TMP20:%.*]] = load i32, i32* [[I3]], align 4 +// CHECK14-NEXT: store i32 [[TMP20]], i32* [[L_ADDR]], align 4 +// CHECK14-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK14: omp.body.continue: +// CHECK14-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK14: omp.inner.for.inc: +// CHECK14-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK14-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP21]], 1 +// CHECK14-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_IV]], align 4 +// CHECK14-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK14: omp.inner.for.end: +// CHECK14-NEXT: br label [[OMP_DISPATCH_INC:%.*]] +// CHECK14: omp.dispatch.inc: +// CHECK14-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK14-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK14-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP22]], [[TMP23]] +// CHECK14-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_LB]], align 4 +// CHECK14-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK14-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK14-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP24]], [[TMP25]] +// CHECK14-NEXT: store i32 [[ADD9]], i32* [[DOTOMP_UB]], align 4 +// CHECK14-NEXT: br label [[OMP_DISPATCH_COND]] +// CHECK14: omp.dispatch.end: +// CHECK14-NEXT: [[TMP26:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK14-NEXT: [[TMP27:%.*]] = load i32, i32* [[TMP26]], align 4 +// CHECK14-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP27]]) +// CHECK14-NEXT: [[TMP28:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK14-NEXT: [[TMP29:%.*]] = icmp ne i32 [[TMP28]], 
0 +// CHECK14-NEXT: br i1 [[TMP29]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]] +// CHECK14: .omp.lastprivate.then: +// CHECK14-NEXT: [[TMP30:%.*]] = load i32, i32* [[L_ADDR]], align 4 +// CHECK14-NEXT: store i32 [[TMP30]], i32* [[L_ADDR]], align 4 +// CHECK14-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]] +// CHECK14: .omp.lastprivate.done: +// CHECK14-NEXT: br label [[OMP_PRECOND_END]] +// CHECK14: omp.precond.end: +// CHECK14-NEXT: ret void +// +// +// CHECK14-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l49 +// CHECK14-SAME: (i32 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { +// CHECK14-NEXT: entry: +// CHECK14-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 4 +// CHECK14-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK14-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK14-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 4 +// CHECK14-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 4 +// CHECK14-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK14-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) +// CHECK14-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK14: .execute: +// CHECK14-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) +// CHECK14-NEXT: [[TMP2:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK14-NEXT: store i32 [[TMP2]], i32* [[N_CASTED]], align 4 +// CHECK14-NEXT: [[TMP3:%.*]] = load i32, i32* [[N_CASTED]], align 4 +// CHECK14-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK14-NEXT: call void @__omp_outlined__2(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32 [[TMP3]], [1000 x i16]* [[TMP0]]) #[[ATTR2]] +// CHECK14-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK14: .omp.deinit: +// CHECK14-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) +// CHECK14-NEXT: br label [[DOTEXIT:%.*]] +// CHECK14: .exit: +// CHECK14-NEXT: ret void +// +// +// CHECK14-LABEL: define {{[^@]+}}@__omp_outlined__2 +// CHECK14-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { +// CHECK14-NEXT: entry: +// CHECK14-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK14-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK14-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 4 +// CHECK14-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: [[I3:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x 
i8*], align 4 +// CHECK14-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK14-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK14-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK14-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 4 +// CHECK14-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 4 +// CHECK14-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK14-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK14-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK14-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 +// CHECK14-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK14-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK14-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK14-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK14-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK14-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] +// CHECK14-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK14: omp.precond.then: +// CHECK14-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK14-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK14-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK14-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK14-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK14-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK14-NEXT: [[TMP5:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK14-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4 +// CHECK14-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP6]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) +// CHECK14-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK14-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK14-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP7]], [[TMP8]] +// CHECK14-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK14: cond.true: +// CHECK14-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK14-NEXT: br label [[COND_END:%.*]] +// CHECK14: cond.false: +// CHECK14-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK14-NEXT: br label [[COND_END]] +// CHECK14: cond.end: +// CHECK14-NEXT: [[COND:%.*]] = phi i32 [ [[TMP9]], [[COND_TRUE]] ], [ [[TMP10]], [[COND_FALSE]] ] +// CHECK14-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK14-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK14-NEXT: store i32 [[TMP11]], i32* [[DOTOMP_IV]], align 4 +// CHECK14-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK14: omp.inner.for.cond: +// CHECK14-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK14-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK14-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP13]], 1 +// CHECK14-NEXT: [[CMP5:%.*]] = icmp slt i32 [[TMP12]], [[ADD]] +// CHECK14-NEXT: br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK14: omp.inner.for.body: +// CHECK14-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK14-NEXT: [[TMP15:%.*]] = load i32, 
i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK14-NEXT: [[TMP16:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK14-NEXT: store i32 [[TMP16]], i32* [[N_CASTED]], align 4 +// CHECK14-NEXT: [[TMP17:%.*]] = load i32, i32* [[N_CASTED]], align 4 +// CHECK14-NEXT: [[TMP18:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 +// CHECK14-NEXT: [[TMP19:%.*]] = inttoptr i32 [[TMP14]] to i8* +// CHECK14-NEXT: store i8* [[TMP19]], i8** [[TMP18]], align 4 +// CHECK14-NEXT: [[TMP20:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 +// CHECK14-NEXT: [[TMP21:%.*]] = inttoptr i32 [[TMP15]] to i8* +// CHECK14-NEXT: store i8* [[TMP21]], i8** [[TMP20]], align 4 +// CHECK14-NEXT: [[TMP22:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 +// CHECK14-NEXT: [[TMP23:%.*]] = inttoptr i32 [[TMP17]] to i8* +// CHECK14-NEXT: store i8* [[TMP23]], i8** [[TMP22]], align 4 +// CHECK14-NEXT: [[TMP24:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3 +// CHECK14-NEXT: [[TMP25:%.*]] = bitcast [1000 x i16]* [[TMP0]] to i8* +// CHECK14-NEXT: store i8* [[TMP25]], i8** [[TMP24]], align 4 +// CHECK14-NEXT: [[TMP26:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK14-NEXT: [[TMP27:%.*]] = load i32, i32* [[TMP26]], align 4 +// CHECK14-NEXT: [[TMP28:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK14-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP27]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, i32, [1000 x i16]*)* @__omp_outlined__3 to i8*), i8* null, i8** [[TMP28]], i32 4) +// CHECK14-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK14: omp.inner.for.inc: +// CHECK14-NEXT: [[TMP29:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK14-NEXT: [[TMP30:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK14-NEXT: [[ADD6:%.*]] = add nsw i32 [[TMP29]], [[TMP30]] +// CHECK14-NEXT: store i32 [[ADD6]], i32* [[DOTOMP_IV]], align 4 +// CHECK14-NEXT: [[TMP31:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK14-NEXT: [[TMP32:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK14-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP31]], [[TMP32]] +// CHECK14-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK14-NEXT: [[TMP33:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK14-NEXT: [[TMP34:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK14-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP33]], [[TMP34]] +// CHECK14-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK14-NEXT: [[TMP35:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK14-NEXT: [[TMP36:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK14-NEXT: [[CMP9:%.*]] = icmp sgt i32 [[TMP35]], [[TMP36]] +// CHECK14-NEXT: br i1 [[CMP9]], label [[COND_TRUE10:%.*]], label [[COND_FALSE11:%.*]] +// CHECK14: cond.true10: +// CHECK14-NEXT: [[TMP37:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK14-NEXT: br label [[COND_END12:%.*]] +// CHECK14: cond.false11: +// CHECK14-NEXT: [[TMP38:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK14-NEXT: br label [[COND_END12]] +// CHECK14: cond.end12: +// CHECK14-NEXT: [[COND13:%.*]] = phi i32 [ [[TMP37]], [[COND_TRUE10]] ], [ [[TMP38]], [[COND_FALSE11]] ] +// CHECK14-NEXT: store i32 [[COND13]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK14-NEXT: [[TMP39:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK14-NEXT: 
store i32 [[TMP39]], i32* [[DOTOMP_IV]], align 4 +// CHECK14-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK14: omp.inner.for.end: +// CHECK14-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK14: omp.loop.exit: +// CHECK14-NEXT: [[TMP40:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK14-NEXT: [[TMP41:%.*]] = load i32, i32* [[TMP40]], align 4 +// CHECK14-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP41]]) +// CHECK14-NEXT: br label [[OMP_PRECOND_END]] +// CHECK14: omp.precond.end: +// CHECK14-NEXT: ret void +// +// +// CHECK14-LABEL: define {{[^@]+}}@__omp_outlined__3 +// CHECK14-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { +// CHECK14-NEXT: entry: +// CHECK14-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK14-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK14-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 4 +// CHECK14-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: [[I3:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK14-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK14-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK14-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK14-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK14-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 4 +// CHECK14-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 4 +// CHECK14-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK14-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK14-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK14-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 +// CHECK14-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK14-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK14-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK14-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK14-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK14-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] +// CHECK14-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK14: omp.precond.then: +// CHECK14-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 +// CHECK14-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK14-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_UB]], align 4 +// CHECK14-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK14-NEXT: [[TMP6:%.*]] = load i32, i32* 
[[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK14-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_LB]], align 4 +// CHECK14-NEXT: store i32 [[TMP6]], i32* [[DOTOMP_UB]], align 4 +// CHECK14-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK14-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK14-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK14-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4 +// CHECK14-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP8]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) +// CHECK14-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK14-NEXT: store i32 [[TMP9]], i32* [[DOTOMP_IV]], align 4 +// CHECK14-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK14: omp.inner.for.cond: +// CHECK14-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK14-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK14-NEXT: [[CMP4:%.*]] = icmp ule i32 [[TMP10]], [[TMP11]] +// CHECK14-NEXT: br i1 [[CMP4]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK14: omp.inner.for.body: +// CHECK14-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK14-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP12]], 1 +// CHECK14-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK14-NEXT: store i32 [[ADD]], i32* [[I3]], align 4 +// CHECK14-NEXT: [[TMP13:%.*]] = load i32, i32* [[I3]], align 4 +// CHECK14-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1000 x i16], [1000 x i16]* [[TMP0]], i32 0, i32 [[TMP13]] +// CHECK14-NEXT: [[TMP14:%.*]] = load i16, i16* [[ARRAYIDX]], align 2 +// CHECK14-NEXT: [[CONV:%.*]] = sext i16 [[TMP14]] to i32 +// CHECK14-NEXT: [[ADD5:%.*]] = add nsw i32 [[CONV]], 1 +// CHECK14-NEXT: [[CONV6:%.*]] = trunc i32 [[ADD5]] to i16 +// CHECK14-NEXT: store i16 [[CONV6]], i16* [[ARRAYIDX]], align 2 +// CHECK14-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK14: omp.body.continue: +// CHECK14-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK14: omp.inner.for.inc: +// CHECK14-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK14-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK14-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP15]], [[TMP16]] +// CHECK14-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_IV]], align 4 +// CHECK14-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK14: omp.inner.for.end: +// CHECK14-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK14: omp.loop.exit: +// CHECK14-NEXT: [[TMP17:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK14-NEXT: [[TMP18:%.*]] = load i32, i32* [[TMP17]], align 4 +// CHECK14-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP18]]) +// CHECK14-NEXT: br label [[OMP_PRECOND_END]] +// CHECK14: omp.precond.end: +// CHECK14-NEXT: ret void +// +// +// CHECK14-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l54 +// CHECK14-SAME: ([10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { +// CHECK14-NEXT: entry: +// CHECK14-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 +// CHECK14-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK14-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 +// CHECK14-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** 
[[B_ADDR]], align 4 +// CHECK14-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK14-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) +// CHECK14-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK14: .execute: +// CHECK14-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) +// CHECK14-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK14-NEXT: call void @__omp_outlined__4(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x i32]* [[TMP0]]) #[[ATTR2]] +// CHECK14-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK14: .omp.deinit: +// CHECK14-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) +// CHECK14-NEXT: br label [[DOTEXIT:%.*]] +// CHECK14: .exit: +// CHECK14-NEXT: ret void +// +// +// CHECK14-LABEL: define {{[^@]+}}@__omp_outlined__4 +// CHECK14-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { +// CHECK14-NEXT: entry: +// CHECK14-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK14-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK14-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 +// CHECK14-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [3 x i8*], align 4 +// CHECK14-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK14-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK14-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 +// CHECK14-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 +// CHECK14-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK14-NEXT: store i32 9, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK14-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK14-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK14-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK14-NEXT: [[TMP1:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK14-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4 +// CHECK14-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) +// CHECK14-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK14-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP3]], 9 +// CHECK14-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK14: cond.true: +// CHECK14-NEXT: br label [[COND_END:%.*]] +// CHECK14: cond.false: +// CHECK14-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK14-NEXT: br label [[COND_END]] +// CHECK14: cond.end: +// CHECK14-NEXT: [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP4]], [[COND_FALSE]] ] +// CHECK14-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK14-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK14-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 +// 
CHECK14-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK14: omp.inner.for.cond: +// CHECK14-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK14-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP6]], 10 +// CHECK14-NEXT: br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK14: omp.inner.for.body: +// CHECK14-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK14-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK14-NEXT: [[TMP9:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 +// CHECK14-NEXT: [[TMP10:%.*]] = inttoptr i32 [[TMP7]] to i8* +// CHECK14-NEXT: store i8* [[TMP10]], i8** [[TMP9]], align 4 +// CHECK14-NEXT: [[TMP11:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 +// CHECK14-NEXT: [[TMP12:%.*]] = inttoptr i32 [[TMP8]] to i8* +// CHECK14-NEXT: store i8* [[TMP12]], i8** [[TMP11]], align 4 +// CHECK14-NEXT: [[TMP13:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 +// CHECK14-NEXT: [[TMP14:%.*]] = bitcast [10 x i32]* [[TMP0]] to i8* +// CHECK14-NEXT: store i8* [[TMP14]], i8** [[TMP13]], align 4 +// CHECK14-NEXT: [[TMP15:%.*]] = bitcast [3 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK14-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, [10 x i32]*)* @__omp_outlined__5 to i8*), i8* null, i8** [[TMP15]], i32 3) +// CHECK14-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK14: omp.inner.for.inc: +// CHECK14-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK14-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK14-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP16]], [[TMP17]] +// CHECK14-NEXT: store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4 +// CHECK14-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK14-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK14-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP18]], [[TMP19]] +// CHECK14-NEXT: store i32 [[ADD2]], i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK14-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK14-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK14-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP20]], [[TMP21]] +// CHECK14-NEXT: store i32 [[ADD3]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK14-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK14-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP22]], 9 +// CHECK14-NEXT: br i1 [[CMP4]], label [[COND_TRUE5:%.*]], label [[COND_FALSE6:%.*]] +// CHECK14: cond.true5: +// CHECK14-NEXT: br label [[COND_END7:%.*]] +// CHECK14: cond.false6: +// CHECK14-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK14-NEXT: br label [[COND_END7]] +// CHECK14: cond.end7: +// CHECK14-NEXT: [[COND8:%.*]] = phi i32 [ 9, [[COND_TRUE5]] ], [ [[TMP23]], [[COND_FALSE6]] ] +// CHECK14-NEXT: store i32 [[COND8]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK14-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK14-NEXT: store i32 [[TMP24]], i32* [[DOTOMP_IV]], align 4 +// CHECK14-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK14: omp.inner.for.end: +// CHECK14-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK14: omp.loop.exit: +// CHECK14-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]]) +// CHECK14-NEXT: ret void +// +// +// 
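For orientation before the next outlined function: the CHECK14 block ending above (`__omp_outlined__4`) is the teams/distribute function for the target region at source line 54. It packs the distribute chunk's bounds and the captured array into `captured_vars_addrs` and launches the parallel region through `__kmpc_parallel_51`; the function checked below (`__omp_outlined__5`) is the corresponding parallel worksharing loop. A minimal sketch of the kind of source that lowers to this shape, assuming a combined construct over a fixed 10-iteration loop (the test's actual source is not part of this hunk, so the names here are illustrative):

    // Illustrative reconstruction only -- not the literal test source.
    // Clang outlines the "teams distribute" part and the "parallel for"
    // part into separate device functions; the distribute chunk's lower
    // and upper bounds travel to the parallel region as captured i32s.
    void increment(int (&b)[10]) {
    #pragma omp target teams distribute parallel for
      for (int i = 0; i < 10; ++i)
        b[i] += 1; // matches the load / add nsw 1 / store in the checks
    }

Note how the schedule arguments to `__kmpc_for_static_init_4` differ by construct in the checks: 91 (`kmp_distribute_static_chunked`) for the distribute loop versus 33 (`kmp_sch_static_chunked`, chunk 1) for the inner worksharing loop, while the `i32 -1, i32 -1` pair passed to `__kmpc_parallel_51` leaves num_threads and proc_bind unspecified.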
CHECK14-LABEL: define {{[^@]+}}@__omp_outlined__5 +// CHECK14-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { +// CHECK14-NEXT: entry: +// CHECK14-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK14-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK14-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 +// CHECK14-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK14-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK14-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK14-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK14-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 +// CHECK14-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 +// CHECK14-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 +// CHECK14-NEXT: store i32 9, i32* [[DOTOMP_UB]], align 4 +// CHECK14-NEXT: [[TMP1:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK14-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK14-NEXT: store i32 [[TMP1]], i32* [[DOTOMP_LB]], align 4 +// CHECK14-NEXT: store i32 [[TMP2]], i32* [[DOTOMP_UB]], align 4 +// CHECK14-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK14-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK14-NEXT: [[TMP3:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK14-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4 +// CHECK14-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP4]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) +// CHECK14-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK14-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 +// CHECK14-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK14: omp.inner.for.cond: +// CHECK14-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK14-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK14-NEXT: [[CMP:%.*]] = icmp ule i32 [[TMP6]], [[TMP7]] +// CHECK14-NEXT: br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK14: omp.inner.for.body: +// CHECK14-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK14-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP8]], 1 +// CHECK14-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK14-NEXT: store i32 [[ADD]], i32* [[I]], align 4 +// CHECK14-NEXT: [[TMP9:%.*]] = load i32, i32* [[I]], align 4 +// CHECK14-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP0]], i32 0, i32 [[TMP9]] +// CHECK14-NEXT: [[TMP10:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +// CHECK14-NEXT: [[ADD1:%.*]] = add nsw i32 
[[TMP10]], 1 +// CHECK14-NEXT: store i32 [[ADD1]], i32* [[ARRAYIDX]], align 4 +// CHECK14-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK14: omp.body.continue: +// CHECK14-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK14: omp.inner.for.inc: +// CHECK14-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK14-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK14-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP11]], [[TMP12]] +// CHECK14-NEXT: store i32 [[ADD2]], i32* [[DOTOMP_IV]], align 4 +// CHECK14-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK14: omp.inner.for.end: +// CHECK14-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK14: omp.loop.exit: +// CHECK14-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP4]]) +// CHECK14-NEXT: ret void +// +// +// CHECK14-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l59 +// CHECK14-SAME: ([10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i32 [[F:%.*]]) #[[ATTR0]] { +// CHECK14-NEXT: entry: +// CHECK14-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4 +// CHECK14-NEXT: [[F_ADDR:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: [[F_CASTED:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK14-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4 +// CHECK14-NEXT: store i32 [[F]], i32* [[F_ADDR]], align 4 +// CHECK14-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4 +// CHECK14-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK14-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) +// CHECK14-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK14: .execute: +// CHECK14-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) +// CHECK14-NEXT: [[TMP2:%.*]] = load i32, i32* [[F_ADDR]], align 4 +// CHECK14-NEXT: store i32 [[TMP2]], i32* [[F_CASTED]], align 4 +// CHECK14-NEXT: [[TMP3:%.*]] = load i32, i32* [[F_CASTED]], align 4 +// CHECK14-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK14-NEXT: call void @__omp_outlined__6(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x [10 x i32]]* [[TMP0]], i32 [[TMP3]]) #[[ATTR2]] +// CHECK14-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK14: .omp.deinit: +// CHECK14-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) +// CHECK14-NEXT: br label [[DOTEXIT:%.*]] +// CHECK14: .exit: +// CHECK14-NEXT: ret void +// +// +// CHECK14-LABEL: define {{[^@]+}}@__omp_outlined__6 +// CHECK14-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i32 [[F:%.*]]) #[[ATTR0]] { +// CHECK14-NEXT: entry: +// CHECK14-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK14-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK14-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4 +// CHECK14-NEXT: [[F_ADDR:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: [[_TMP1:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca 
i32, align 4 +// CHECK14-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: [[K:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: [[J:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: [[F_CASTED:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x i8*], align 4 +// CHECK14-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK14-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK14-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4 +// CHECK14-NEXT: store i32 [[F]], i32* [[F_ADDR]], align 4 +// CHECK14-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4 +// CHECK14-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK14-NEXT: store i32 99, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK14-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK14-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK14-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK14-NEXT: [[TMP1:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK14-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4 +// CHECK14-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) +// CHECK14-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK14-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP3]], 99 +// CHECK14-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK14: cond.true: +// CHECK14-NEXT: br label [[COND_END:%.*]] +// CHECK14: cond.false: +// CHECK14-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK14-NEXT: br label [[COND_END]] +// CHECK14: cond.end: +// CHECK14-NEXT: [[COND:%.*]] = phi i32 [ 99, [[COND_TRUE]] ], [ [[TMP4]], [[COND_FALSE]] ] +// CHECK14-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK14-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK14-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 +// CHECK14-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK14: omp.inner.for.cond: +// CHECK14-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK14-NEXT: [[CMP2:%.*]] = icmp slt i32 [[TMP6]], 100 +// CHECK14-NEXT: br i1 [[CMP2]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK14: omp.inner.for.body: +// CHECK14-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK14-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK14-NEXT: [[TMP9:%.*]] = load i32, i32* [[F_ADDR]], align 4 +// CHECK14-NEXT: store i32 [[TMP9]], i32* [[F_CASTED]], align 4 +// CHECK14-NEXT: [[TMP10:%.*]] = load i32, i32* [[F_CASTED]], align 4 +// CHECK14-NEXT: [[TMP11:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 +// CHECK14-NEXT: [[TMP12:%.*]] = inttoptr i32 [[TMP7]] to i8* +// CHECK14-NEXT: store i8* [[TMP12]], i8** [[TMP11]], align 4 +// CHECK14-NEXT: [[TMP13:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 +// CHECK14-NEXT: [[TMP14:%.*]] = inttoptr i32 [[TMP8]] to i8* +// CHECK14-NEXT: store i8* [[TMP14]], i8** [[TMP13]], align 4 +// CHECK14-NEXT: [[TMP15:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* 
[[CAPTURED_VARS_ADDRS]], i32 0, i32 2
+// CHECK14-NEXT: [[TMP16:%.*]] = bitcast [10 x [10 x i32]]* [[TMP0]] to i8*
+// CHECK14-NEXT: store i8* [[TMP16]], i8** [[TMP15]], align 4
+// CHECK14-NEXT: [[TMP17:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3
+// CHECK14-NEXT: [[TMP18:%.*]] = inttoptr i32 [[TMP10]] to i8*
+// CHECK14-NEXT: store i8* [[TMP18]], i8** [[TMP17]], align 4
+// CHECK14-NEXT: [[TMP19:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK14-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, [10 x [10 x i32]]*, i32)* @__omp_outlined__7 to i8*), i8* null, i8** [[TMP19]], i32 4)
+// CHECK14-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK14: omp.inner.for.inc:
+// CHECK14-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK14-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK14-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP20]], [[TMP21]]
+// CHECK14-NEXT: store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4
+// CHECK14-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK14-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK14-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP22]], [[TMP23]]
+// CHECK14-NEXT: store i32 [[ADD3]], i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK14-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK14-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK14-NEXT: [[ADD4:%.*]] = add nsw i32 [[TMP24]], [[TMP25]]
+// CHECK14-NEXT: store i32 [[ADD4]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK14-NEXT: [[TMP26:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK14-NEXT: [[CMP5:%.*]] = icmp sgt i32 [[TMP26]], 99
+// CHECK14-NEXT: br i1 [[CMP5]], label [[COND_TRUE6:%.*]], label [[COND_FALSE7:%.*]]
+// CHECK14: cond.true6:
+// CHECK14-NEXT: br label [[COND_END8:%.*]]
+// CHECK14: cond.false7:
+// CHECK14-NEXT: [[TMP27:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK14-NEXT: br label [[COND_END8]]
+// CHECK14: cond.end8:
+// CHECK14-NEXT: [[COND9:%.*]] = phi i32 [ 99, [[COND_TRUE6]] ], [ [[TMP27]], [[COND_FALSE7]] ]
+// CHECK14-NEXT: store i32 [[COND9]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK14-NEXT: [[TMP28:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK14-NEXT: store i32 [[TMP28]], i32* [[DOTOMP_IV]], align 4
+// CHECK14-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK14: omp.inner.for.end:
+// CHECK14-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK14: omp.loop.exit:
+// CHECK14-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]])
+// CHECK14-NEXT: ret void
+//
+//
+// CHECK14-LABEL: define {{[^@]+}}@__omp_outlined__7
+// CHECK14-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i32 [[F:%.*]]) #[[ATTR0]] {
+// CHECK14-NEXT: entry:
+// CHECK14-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK14-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK14-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4
+// CHECK14-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4
+// CHECK14-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4
+// CHECK14-NEXT: [[F_ADDR:%.*]] = alloca i32, align 4
+// CHECK14-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK14-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK14-NEXT: [[_TMP1:%.*]] = alloca i32, align 4
+// CHECK14-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK14-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK14-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK14-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK14-NEXT: [[K:%.*]] = alloca i32, align 4
+// CHECK14-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK14-NEXT: [[J:%.*]] = alloca i32, align 4
+// CHECK14-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK14-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK14-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK14-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK14-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4
+// CHECK14-NEXT: store i32 [[F]], i32* [[F_ADDR]], align 4
+// CHECK14-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4
+// CHECK14-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK14-NEXT: store i32 99, i32* [[DOTOMP_UB]], align 4
+// CHECK14-NEXT: [[TMP1:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK14-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK14-NEXT: store i32 [[TMP1]], i32* [[DOTOMP_LB]], align 4
+// CHECK14-NEXT: store i32 [[TMP2]], i32* [[DOTOMP_UB]], align 4
+// CHECK14-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK14-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK14-NEXT: [[TMP3:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK14-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4
+// CHECK14-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP4]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK14-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK14-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4
+// CHECK14-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK14: omp.inner.for.cond:
+// CHECK14-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK14-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK14-NEXT: [[CMP:%.*]] = icmp ule i32 [[TMP6]], [[TMP7]]
+// CHECK14-NEXT: br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK14: omp.inner.for.body:
+// CHECK14-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK14-NEXT: [[DIV:%.*]] = sdiv i32 [[TMP8]], 10
+// CHECK14-NEXT: [[MUL:%.*]] = mul nsw i32 [[DIV]], 1
+// CHECK14-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK14-NEXT: store i32 [[ADD]], i32* [[I]], align 4
+// CHECK14-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK14-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK14-NEXT: [[DIV2:%.*]] = sdiv i32 [[TMP10]], 10
+// CHECK14-NEXT: [[MUL3:%.*]] = mul nsw i32 [[DIV2]], 10
+// CHECK14-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP9]], [[MUL3]]
+// CHECK14-NEXT: [[MUL4:%.*]] = mul nsw i32 [[SUB]], 1
+// CHECK14-NEXT: [[ADD5:%.*]] = add nsw i32 0, [[MUL4]]
+// CHECK14-NEXT: store i32 [[ADD5]], i32* [[J]], align 4
+// CHECK14-NEXT: store i32 10, i32* [[K]], align 4
+// CHECK14-NEXT: [[TMP11:%.*]] = load i32, i32* [[I]], align 4
+// CHECK14-NEXT: [[TMP12:%.*]] = load i32, i32* [[J]], align 4
+// CHECK14-NEXT: [[TMP13:%.*]] = load i32, i32* [[F_ADDR]], align 4
+// CHECK14-NEXT: [[MUL6:%.*]] = mul nsw i32 [[TMP12]], [[TMP13]]
+// CHECK14-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP11]], [[MUL6]]
+// CHECK14-NEXT: [[TMP14:%.*]] = load i32, i32* [[K]], align 4
+// CHECK14-NEXT: [[ADD8:%.*]] = add nsw i32 [[ADD7]], [[TMP14]]
+// CHECK14-NEXT: [[TMP15:%.*]] = load i32, i32* [[I]], align 4
+// CHECK14-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* [[TMP0]], i32 0, i32 [[TMP15]]
+// CHECK14-NEXT: [[TMP16:%.*]] = load i32, i32* [[J]], align 4
+// CHECK14-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[ARRAYIDX]], i32 0, i32 [[TMP16]]
+// CHECK14-NEXT: store i32 [[ADD8]], i32* [[ARRAYIDX9]], align 4
+// CHECK14-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK14: omp.body.continue:
+// CHECK14-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK14: omp.inner.for.inc:
+// CHECK14-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK14-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK14-NEXT: [[ADD10:%.*]] = add nsw i32 [[TMP17]], [[TMP18]]
+// CHECK14-NEXT: store i32 [[ADD10]], i32* [[DOTOMP_IV]], align 4
+// CHECK14-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK14: omp.inner.for.end:
+// CHECK14-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK14: omp.loop.exit:
+// CHECK14-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP4]])
+// CHECK14-NEXT: ret void
+//
+//
+// CHECK14-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l67
+// CHECK14-SAME: (i32 [[N:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]]) #[[ATTR0]] {
+// CHECK14-NEXT: entry:
+// CHECK14-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4
+// CHECK14-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4
+// CHECK14-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4
+// CHECK14-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK14-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK14-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK14-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4
+// CHECK14-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4
+// CHECK14-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4
+// CHECK14-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK14-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0)
+// CHECK14-NEXT: br label [[DOTEXECUTE:%.*]]
+// CHECK14: .execute:
+// CHECK14-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]])
+// CHECK14-NEXT: [[TMP2:%.*]] = load i32, i32* [[N_ADDR]], align 4
+// CHECK14-NEXT: store i32 [[TMP2]], i32* [[N_CASTED]], align 4
+// CHECK14-NEXT: [[TMP3:%.*]] = load i32, i32* [[N_CASTED]], align 4
+// CHECK14-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK14-NEXT: call void @__omp_outlined__8(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32 [[TMP3]], [10 x [10 x i32]]* [[TMP0]]) #[[ATTR2]]
+// CHECK14-NEXT: br label [[DOTOMP_DEINIT:%.*]]
+// CHECK14: .omp.deinit:
+// CHECK14-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0)
+// CHECK14-NEXT: br label [[DOTEXIT:%.*]]
+// CHECK14: .exit:
+// CHECK14-NEXT: ret void
+//
+//
+// CHECK14-LABEL: define {{[^@]+}}@__omp_outlined__8
+// CHECK14-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[N:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]]) #[[ATTR0]] {
+// CHECK14-NEXT: entry:
+// CHECK14-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK14-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK14-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4
+// CHECK14-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4
+// CHECK14-NEXT: [[DOTOMP_IV:%.*]] = alloca i64, align 8
+// CHECK14-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK14-NEXT: [[_TMP1:%.*]] = alloca i32, align 4
+// CHECK14-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK14-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4
+// CHECK14-NEXT: [[DOTCAPTURE_EXPR_3:%.*]] = alloca i64, align 8
+// CHECK14-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK14-NEXT: [[J:%.*]] = alloca i32, align 4
+// CHECK14-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i64, align 8
+// CHECK14-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i64, align 8
+// CHECK14-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i64, align 8
+// CHECK14-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK14-NEXT: [[I9:%.*]] = alloca i32, align 4
+// CHECK14-NEXT: [[J10:%.*]] = alloca i32, align 4
+// CHECK14-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4
+// CHECK14-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x i8*], align 4
+// CHECK14-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK14-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK14-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4
+// CHECK14-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4
+// CHECK14-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4
+// CHECK14-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4
+// CHECK14-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK14-NEXT: [[TMP2:%.*]] = load i32, i32* [[N_ADDR]], align 4
+// CHECK14-NEXT: store i32 [[TMP2]], i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK14-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK14-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP3]], 0
+// CHECK14-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK14-NEXT: [[CONV:%.*]] = sext i32 [[DIV]] to i64
+// CHECK14-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK14-NEXT: [[SUB4:%.*]] = sub nsw i32 [[TMP4]], 0
+// CHECK14-NEXT: [[DIV5:%.*]] = sdiv i32 [[SUB4]], 1
+// CHECK14-NEXT: [[CONV6:%.*]] = sext i32 [[DIV5]] to i64
+// CHECK14-NEXT: [[MUL:%.*]] = mul nsw i64 [[CONV]], [[CONV6]]
+// CHECK14-NEXT: [[SUB7:%.*]] = sub nsw i64 [[MUL]], 1
+// CHECK14-NEXT: store i64 [[SUB7]], i64* [[DOTCAPTURE_EXPR_3]], align 8
+// CHECK14-NEXT: store i32 0, i32* [[I]], align 4
+// CHECK14-NEXT: store i32 0, i32* [[J]], align 4
+// CHECK14-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK14-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP5]]
+// CHECK14-NEXT: br i1 [[CMP]], label [[LAND_LHS_TRUE:%.*]], label [[OMP_PRECOND_END:%.*]]
+// CHECK14: land.lhs.true:
+// CHECK14-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK14-NEXT: [[CMP8:%.*]] = icmp slt i32 0, [[TMP6]]
+// CHECK14-NEXT: br i1 [[CMP8]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END]]
+// CHECK14: omp.precond.then:
+// CHECK14-NEXT: store i64 0, i64* [[DOTOMP_COMB_LB]], align 8
+// CHECK14-NEXT: [[TMP7:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8
+// CHECK14-NEXT: store i64 [[TMP7]], i64* [[DOTOMP_COMB_UB]], align 8
+// CHECK14-NEXT: store i64 1, i64* [[DOTOMP_STRIDE]], align 8
+// CHECK14-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK14-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK14-NEXT: [[CONV11:%.*]] = zext i32 [[NVPTX_NUM_THREADS]] to i64
+// CHECK14-NEXT: [[TMP8:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK14-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP8]], align 4
+// CHECK14-NEXT: call void @__kmpc_for_static_init_8(%struct.ident_t* @[[GLOB1]], i32 [[TMP9]], i32 91, i32* [[DOTOMP_IS_LAST]], i64* [[DOTOMP_COMB_LB]], i64* [[DOTOMP_COMB_UB]], i64* [[DOTOMP_STRIDE]], i64 1, i64 [[CONV11]])
+// CHECK14-NEXT: [[TMP10:%.*]] = load i64, i64* [[DOTOMP_COMB_UB]], align 8
+// CHECK14-NEXT: [[TMP11:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8
+// CHECK14-NEXT: [[CMP12:%.*]] = icmp sgt i64 [[TMP10]], [[TMP11]]
+// CHECK14-NEXT: br i1 [[CMP12]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK14: cond.true:
+// CHECK14-NEXT: [[TMP12:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8
+// CHECK14-NEXT: br label [[COND_END:%.*]]
+// CHECK14: cond.false:
+// CHECK14-NEXT: [[TMP13:%.*]] = load i64, i64* [[DOTOMP_COMB_UB]], align 8
+// CHECK14-NEXT: br label [[COND_END]]
+// CHECK14: cond.end:
+// CHECK14-NEXT: [[COND:%.*]] = phi i64 [ [[TMP12]], [[COND_TRUE]] ], [ [[TMP13]], [[COND_FALSE]] ]
+// CHECK14-NEXT: store i64 [[COND]], i64* [[DOTOMP_COMB_UB]], align 8
+// CHECK14-NEXT: [[TMP14:%.*]] = load i64, i64* [[DOTOMP_COMB_LB]], align 8
+// CHECK14-NEXT: store i64 [[TMP14]], i64* [[DOTOMP_IV]], align 8
+// CHECK14-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK14: omp.inner.for.cond:
+// CHECK14-NEXT: [[TMP15:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8
+// CHECK14-NEXT: [[TMP16:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8
+// CHECK14-NEXT: [[ADD:%.*]] = add nsw i64 [[TMP16]], 1
+// CHECK14-NEXT: [[CMP13:%.*]] = icmp slt i64 [[TMP15]], [[ADD]]
+// CHECK14-NEXT: br i1 [[CMP13]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK14: omp.inner.for.body:
+// CHECK14-NEXT: [[TMP17:%.*]] = load i64, i64* [[DOTOMP_COMB_LB]], align 8
+// CHECK14-NEXT: [[TMP18:%.*]] = trunc i64 [[TMP17]] to i32
+// CHECK14-NEXT: [[TMP19:%.*]] = load i64, i64* [[DOTOMP_COMB_UB]], align 8
+// CHECK14-NEXT: [[TMP20:%.*]] = trunc i64 [[TMP19]] to i32
+// CHECK14-NEXT: [[TMP21:%.*]] = load i32, i32* [[N_ADDR]], align 4
+// CHECK14-NEXT: store i32 [[TMP21]], i32* [[N_CASTED]], align 4
+// CHECK14-NEXT: [[TMP22:%.*]] = load i32, i32* [[N_CASTED]], align 4
+// CHECK14-NEXT: [[TMP23:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0
+// CHECK14-NEXT: [[TMP24:%.*]] = inttoptr i32 [[TMP18]] to i8*
+// CHECK14-NEXT: store i8* [[TMP24]], i8** [[TMP23]], align 4
+// CHECK14-NEXT: [[TMP25:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1
+// CHECK14-NEXT: [[TMP26:%.*]] = inttoptr i32 [[TMP20]] to i8*
+// CHECK14-NEXT: store i8* [[TMP26]], i8** [[TMP25]], align 4
+// CHECK14-NEXT: [[TMP27:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2
+// CHECK14-NEXT: [[TMP28:%.*]] = inttoptr i32 [[TMP22]] to i8*
+// CHECK14-NEXT: store i8* [[TMP28]], i8** [[TMP27]], align 4
+// CHECK14-NEXT: [[TMP29:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3
+// CHECK14-NEXT: [[TMP30:%.*]] = bitcast [10 x [10 x i32]]* [[TMP0]] to i8*
+// CHECK14-NEXT: store i8* [[TMP30]], i8** [[TMP29]], align 4
+// CHECK14-NEXT: [[TMP31:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK14-NEXT: [[TMP32:%.*]] = load i32, i32* [[TMP31]], align 4
+// CHECK14-NEXT: [[TMP33:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK14-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP32]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, i32, [10 x [10 x i32]]*)* @__omp_outlined__9 to i8*), i8* null, i8** [[TMP33]], i32 4)
+// CHECK14-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK14: omp.inner.for.inc:
+// CHECK14-NEXT: [[TMP34:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8
+// CHECK14-NEXT: [[TMP35:%.*]] = load i64, i64* [[DOTOMP_STRIDE]], align 8
+// CHECK14-NEXT: [[ADD14:%.*]] = add nsw i64 [[TMP34]], [[TMP35]]
+// CHECK14-NEXT: store i64 [[ADD14]], i64* [[DOTOMP_IV]], align 8
+// CHECK14-NEXT: [[TMP36:%.*]] = load i64, i64* [[DOTOMP_COMB_LB]], align 8
+// CHECK14-NEXT: [[TMP37:%.*]] = load i64, i64* [[DOTOMP_STRIDE]], align 8
+// CHECK14-NEXT: [[ADD15:%.*]] = add nsw i64 [[TMP36]], [[TMP37]]
+// CHECK14-NEXT: store i64 [[ADD15]], i64* [[DOTOMP_COMB_LB]], align 8
+// CHECK14-NEXT: [[TMP38:%.*]] = load i64, i64* [[DOTOMP_COMB_UB]], align 8
+// CHECK14-NEXT: [[TMP39:%.*]] = load i64, i64* [[DOTOMP_STRIDE]], align 8
+// CHECK14-NEXT: [[ADD16:%.*]] = add nsw i64 [[TMP38]], [[TMP39]]
+// CHECK14-NEXT: store i64 [[ADD16]], i64* [[DOTOMP_COMB_UB]], align 8
+// CHECK14-NEXT: [[TMP40:%.*]] = load i64, i64* [[DOTOMP_COMB_UB]], align 8
+// CHECK14-NEXT: [[TMP41:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8
+// CHECK14-NEXT: [[CMP17:%.*]] = icmp sgt i64 [[TMP40]], [[TMP41]]
+// CHECK14-NEXT: br i1 [[CMP17]], label [[COND_TRUE18:%.*]], label [[COND_FALSE19:%.*]]
+// CHECK14: cond.true18:
+// CHECK14-NEXT: [[TMP42:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8
+// CHECK14-NEXT: br label [[COND_END20:%.*]]
+// CHECK14: cond.false19:
+// CHECK14-NEXT: [[TMP43:%.*]] = load i64, i64* [[DOTOMP_COMB_UB]], align 8
+// CHECK14-NEXT: br label [[COND_END20]]
+// CHECK14: cond.end20:
+// CHECK14-NEXT: [[COND21:%.*]] = phi i64 [ [[TMP42]], [[COND_TRUE18]] ], [ [[TMP43]], [[COND_FALSE19]] ]
+// CHECK14-NEXT: store i64 [[COND21]], i64* [[DOTOMP_COMB_UB]], align 8
+// CHECK14-NEXT: [[TMP44:%.*]] = load i64, i64* [[DOTOMP_COMB_LB]], align 8
+// CHECK14-NEXT: store i64 [[TMP44]], i64* [[DOTOMP_IV]], align 8
+// CHECK14-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK14: omp.inner.for.end:
+// CHECK14-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK14: omp.loop.exit:
+// CHECK14-NEXT: [[TMP45:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK14-NEXT: [[TMP46:%.*]] = load i32, i32* [[TMP45]], align 4
+// CHECK14-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP46]])
+// CHECK14-NEXT: br label [[OMP_PRECOND_END]]
+// CHECK14: omp.precond.end:
+// CHECK14-NEXT: ret void
+//
+//
+// CHECK14-LABEL: define {{[^@]+}}@__omp_outlined__9
+// CHECK14-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32 [[N:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]]) #[[ATTR0]] {
+// CHECK14-NEXT: entry:
+// CHECK14-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK14-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK14-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4
+// CHECK14-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4
+// CHECK14-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4
+// CHECK14-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4
+// CHECK14-NEXT: [[DOTOMP_IV:%.*]] = alloca i64, align 8
+// CHECK14-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK14-NEXT: [[_TMP1:%.*]] = alloca i32, align 4
+// CHECK14-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK14-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4
+// CHECK14-NEXT: [[DOTCAPTURE_EXPR_3:%.*]] = alloca i64, align 8
+// CHECK14-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK14-NEXT: [[J:%.*]] = alloca i32, align 4
+// CHECK14-NEXT: [[DOTOMP_LB:%.*]] = alloca i64, align 8
+// CHECK14-NEXT: [[DOTOMP_UB:%.*]] = alloca i64, align 8
+// CHECK14-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i64, align 8
+// CHECK14-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK14-NEXT: [[I11:%.*]] = alloca i32, align 4
+// CHECK14-NEXT: [[J12:%.*]] = alloca i32, align 4
+// CHECK14-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK14-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK14-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK14-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK14-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4
+// CHECK14-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4
+// CHECK14-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4
+// CHECK14-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4
+// CHECK14-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK14-NEXT: [[TMP2:%.*]] = load i32, i32* [[N_ADDR]], align 4
+// CHECK14-NEXT: store i32 [[TMP2]], i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK14-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK14-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP3]], 0
+// CHECK14-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK14-NEXT: [[CONV:%.*]] = sext i32 [[DIV]] to i64
+// CHECK14-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK14-NEXT: [[SUB4:%.*]] = sub nsw i32 [[TMP4]], 0
+// CHECK14-NEXT: [[DIV5:%.*]] = sdiv i32 [[SUB4]], 1
+// CHECK14-NEXT: [[CONV6:%.*]] = sext i32 [[DIV5]] to i64
+// CHECK14-NEXT: [[MUL:%.*]] = mul nsw i64 [[CONV]], [[CONV6]]
+// CHECK14-NEXT: [[SUB7:%.*]] = sub nsw i64 [[MUL]], 1
+// CHECK14-NEXT: store i64 [[SUB7]], i64* [[DOTCAPTURE_EXPR_3]], align 8
+// CHECK14-NEXT: store i32 0, i32* [[I]], align 4
+// CHECK14-NEXT: store i32 0, i32* [[J]], align 4
+// CHECK14-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK14-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP5]]
+// CHECK14-NEXT: br i1 [[CMP]], label [[LAND_LHS_TRUE:%.*]], label [[OMP_PRECOND_END:%.*]]
+// CHECK14: land.lhs.true:
+// CHECK14-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK14-NEXT: [[CMP8:%.*]] = icmp slt i32 0, [[TMP6]]
+// CHECK14-NEXT: br i1 [[CMP8]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END]]
+// CHECK14: omp.precond.then:
+// CHECK14-NEXT: store i64 0, i64* [[DOTOMP_LB]], align 8
+// CHECK14-NEXT: [[TMP7:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8
+// CHECK14-NEXT: store i64 [[TMP7]], i64* [[DOTOMP_UB]], align 8
+// CHECK14-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK14-NEXT: [[CONV9:%.*]] = zext i32 [[TMP8]] to i64
+// CHECK14-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK14-NEXT: [[CONV10:%.*]] = zext i32 [[TMP9]] to i64
+// CHECK14-NEXT: store i64 [[CONV9]], i64* [[DOTOMP_LB]], align 8
+// CHECK14-NEXT: store i64 [[CONV10]], i64* [[DOTOMP_UB]], align 8
+// CHECK14-NEXT: store i64 1, i64* [[DOTOMP_STRIDE]], align 8
+// CHECK14-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK14-NEXT: [[TMP10:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK14-NEXT: [[TMP11:%.*]] = load i32, i32* [[TMP10]], align 4
+// CHECK14-NEXT: call void @__kmpc_for_static_init_8(%struct.ident_t* @[[GLOB2]], i32 [[TMP11]], i32 33, i32* [[DOTOMP_IS_LAST]], i64* [[DOTOMP_LB]], i64* [[DOTOMP_UB]], i64* [[DOTOMP_STRIDE]], i64 1, i64 1)
+// CHECK14-NEXT: [[TMP12:%.*]] = load i64, i64* [[DOTOMP_LB]], align 8
+// CHECK14-NEXT: store i64 [[TMP12]], i64* [[DOTOMP_IV]], align 8
+// CHECK14-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK14: omp.inner.for.cond:
+// CHECK14-NEXT: [[TMP13:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8
+// CHECK14-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK14-NEXT: [[CONV13:%.*]] = zext i32 [[TMP14]] to i64
+// CHECK14-NEXT: [[CMP14:%.*]] = icmp sle i64 [[TMP13]], [[CONV13]]
+// CHECK14-NEXT: br i1 [[CMP14]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK14: omp.inner.for.body:
+// CHECK14-NEXT: [[TMP15:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8
+// CHECK14-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK14-NEXT: [[SUB15:%.*]] = sub nsw i32 [[TMP16]], 0
+// CHECK14-NEXT: [[DIV16:%.*]] = sdiv i32 [[SUB15]], 1
+// CHECK14-NEXT: [[MUL17:%.*]] = mul nsw i32 1, [[DIV16]]
+// CHECK14-NEXT: [[CONV18:%.*]] = sext i32 [[MUL17]] to i64
+// CHECK14-NEXT: [[DIV19:%.*]] = sdiv i64 [[TMP15]], [[CONV18]]
+// CHECK14-NEXT: [[MUL20:%.*]] = mul nsw i64 [[DIV19]], 1
+// CHECK14-NEXT: [[ADD:%.*]] = add nsw i64 0, [[MUL20]]
+// CHECK14-NEXT: [[CONV21:%.*]] = trunc i64 [[ADD]] to i32
+// CHECK14-NEXT: store i32 [[CONV21]], i32* [[I11]], align 4
+// CHECK14-NEXT: [[TMP17:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8
+// CHECK14-NEXT: [[TMP18:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8
+// CHECK14-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK14-NEXT: [[SUB22:%.*]] = sub nsw i32 [[TMP19]], 0
+// CHECK14-NEXT: [[DIV23:%.*]] = sdiv i32 [[SUB22]], 1
+// CHECK14-NEXT: [[MUL24:%.*]] = mul nsw i32 1, [[DIV23]]
+// CHECK14-NEXT: [[CONV25:%.*]] = sext i32 [[MUL24]] to i64
+// CHECK14-NEXT: [[DIV26:%.*]] = sdiv i64 [[TMP18]], [[CONV25]]
+// CHECK14-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK14-NEXT: [[SUB27:%.*]] = sub nsw i32 [[TMP20]], 0
+// CHECK14-NEXT: [[DIV28:%.*]] = sdiv i32 [[SUB27]], 1
+// CHECK14-NEXT: [[MUL29:%.*]] = mul nsw i32 1, [[DIV28]]
+// CHECK14-NEXT: [[CONV30:%.*]] = sext i32 [[MUL29]] to i64
+// CHECK14-NEXT: [[MUL31:%.*]] = mul nsw i64 [[DIV26]], [[CONV30]]
+// CHECK14-NEXT: [[SUB32:%.*]] = sub nsw i64 [[TMP17]], [[MUL31]]
+// CHECK14-NEXT: [[MUL33:%.*]] = mul nsw i64 [[SUB32]], 1
+// CHECK14-NEXT: [[ADD34:%.*]] = add nsw i64 0, [[MUL33]]
+// CHECK14-NEXT: [[CONV35:%.*]] = trunc i64 [[ADD34]] to i32
+// CHECK14-NEXT: store i32 [[CONV35]], i32* [[J12]], align 4
+// CHECK14-NEXT: [[TMP21:%.*]] = load i32, i32* [[I11]], align 4
+// CHECK14-NEXT: [[TMP22:%.*]] = load i32, i32* [[J12]], align 4
+// CHECK14-NEXT: [[ADD36:%.*]] = add nsw i32 [[TMP21]], [[TMP22]]
+// CHECK14-NEXT: [[TMP23:%.*]] = load i32, i32* [[I11]], align 4
+// CHECK14-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* [[TMP0]], i32 0, i32 [[TMP23]]
+// CHECK14-NEXT: [[TMP24:%.*]] = load i32, i32* [[J12]], align 4
+// CHECK14-NEXT: [[ARRAYIDX37:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[ARRAYIDX]], i32 0, i32 [[TMP24]]
+// CHECK14-NEXT: store i32 [[ADD36]], i32* [[ARRAYIDX37]], align 4
+// CHECK14-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK14: omp.body.continue:
+// CHECK14-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK14: omp.inner.for.inc:
+// CHECK14-NEXT: [[TMP25:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8
+// CHECK14-NEXT: [[TMP26:%.*]] = load i64, i64* [[DOTOMP_STRIDE]], align 8
+// CHECK14-NEXT: [[ADD38:%.*]] = add nsw i64 [[TMP25]], [[TMP26]]
+// CHECK14-NEXT: store i64 [[ADD38]], i64* [[DOTOMP_IV]], align 8
+// CHECK14-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK14: omp.inner.for.end:
+// CHECK14-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK14: omp.loop.exit:
+// CHECK14-NEXT: [[TMP27:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK14-NEXT: [[TMP28:%.*]] = load i32, i32* [[TMP27]], align 4
+// CHECK14-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP28]])
+// CHECK14-NEXT: br label [[OMP_PRECOND_END]]
+// CHECK14: omp.precond.end:
+// CHECK14-NEXT: ret void
+//
+//
+// CHECK14-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l74
+// CHECK14-SAME: (i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32* [[V:%.*]]) #[[ATTR0]] {
+// CHECK14-NEXT: entry:
+// CHECK14-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4
+// CHECK14-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4
+// CHECK14-NEXT: [[V_ADDR:%.*]] = alloca i32*, align 4
+// CHECK14-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4
+// CHECK14-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK14-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK14-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK14-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4
+// CHECK14-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4
+// CHECK14-NEXT: store i32* [[V]], i32** [[V_ADDR]], align 4
+// CHECK14-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4
+// CHECK14-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK14-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0)
+// CHECK14-NEXT: br label [[DOTEXECUTE:%.*]]
+// CHECK14: .execute:
+// CHECK14-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]])
+// CHECK14-NEXT: [[TMP2:%.*]] = load i32, i32* [[N_ADDR]], align 4
+// CHECK14-NEXT: store i32 [[TMP2]], i32* [[N_CASTED]], align 4
+// CHECK14-NEXT: [[TMP3:%.*]] = load i32, i32* [[N_CASTED]], align 4
+// CHECK14-NEXT: [[TMP4:%.*]] = load i32*, i32** [[V_ADDR]], align 4
+// CHECK14-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK14-NEXT: call void @__omp_outlined__10(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32 [[TMP3]], [1000 x i32]* [[TMP0]], i32* [[TMP4]]) #[[ATTR2]]
+// CHECK14-NEXT: br label [[DOTOMP_DEINIT:%.*]]
+// CHECK14: .omp.deinit:
+// CHECK14-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0)
+// CHECK14-NEXT: br label [[DOTEXIT:%.*]]
+// CHECK14: .exit:
+// CHECK14-NEXT: ret void
+//
+//
+// CHECK14-LABEL: define {{[^@]+}}@__omp_outlined__10
+// CHECK14-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32* [[V:%.*]]) #[[ATTR0]] {
+// CHECK14-NEXT: entry:
+// CHECK14-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK14-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK14-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4
+// CHECK14-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4
+// CHECK14-NEXT: [[V_ADDR:%.*]] = alloca i32*, align 4
+// CHECK14-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK14-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK14-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK14-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
+// CHECK14-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK14-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK14-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK14-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK14-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK14-NEXT: [[I3:%.*]] = alloca i32, align 4
+// CHECK14-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4
+// CHECK14-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [5 x i8*], align 4
+// CHECK14-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK14-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK14-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4
+// CHECK14-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4
+// CHECK14-NEXT: store i32* [[V]], i32** [[V_ADDR]], align 4
+// CHECK14-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4
+// CHECK14-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4
+// CHECK14-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK14-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK14-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0
+// CHECK14-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK14-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1
+// CHECK14-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK14-NEXT: store i32 0, i32* [[I]], align 4
+// CHECK14-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK14-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]]
+// CHECK14-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]]
+// CHECK14: omp.precond.then:
+// CHECK14-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK14-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK14-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK14-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK14-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK14-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK14-NEXT: [[TMP5:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK14-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4
+// CHECK14-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP6]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]])
+// CHECK14-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK14-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK14-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP7]], [[TMP8]]
+// CHECK14-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK14: cond.true:
+// CHECK14-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK14-NEXT: br label [[COND_END:%.*]]
+// CHECK14: cond.false:
+// CHECK14-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK14-NEXT: br label [[COND_END]]
+// CHECK14: cond.end:
+// CHECK14-NEXT: [[COND:%.*]] = phi i32 [ [[TMP9]], [[COND_TRUE]] ], [ [[TMP10]], [[COND_FALSE]] ]
+// CHECK14-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK14-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK14-NEXT: store i32 [[TMP11]], i32* [[DOTOMP_IV]], align 4
+// CHECK14-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK14: omp.inner.for.cond:
+// CHECK14-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK14-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK14-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP13]], 1
+// CHECK14-NEXT: [[CMP5:%.*]] = icmp slt i32 [[TMP12]], [[ADD]]
+// CHECK14-NEXT: br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK14: omp.inner.for.body:
+// CHECK14-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK14-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK14-NEXT: [[TMP16:%.*]] = load i32, i32* [[N_ADDR]], align 4
+// CHECK14-NEXT: store i32 [[TMP16]], i32* [[N_CASTED]], align 4
+// CHECK14-NEXT: [[TMP17:%.*]] = load i32, i32* [[N_CASTED]], align 4
+// CHECK14-NEXT: [[TMP18:%.*]] = load i32*, i32** [[V_ADDR]], align 4
+// CHECK14-NEXT: [[TMP19:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0
+// CHECK14-NEXT: [[TMP20:%.*]] = inttoptr i32 [[TMP14]] to i8*
+// CHECK14-NEXT: store i8* [[TMP20]], i8** [[TMP19]], align 4
+// CHECK14-NEXT: [[TMP21:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1
+// CHECK14-NEXT: [[TMP22:%.*]] = inttoptr i32 [[TMP15]] to i8*
+// CHECK14-NEXT: store i8* [[TMP22]], i8** [[TMP21]], align 4
+// CHECK14-NEXT: [[TMP23:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2
+// CHECK14-NEXT: [[TMP24:%.*]] = inttoptr i32 [[TMP17]] to i8*
+// CHECK14-NEXT: store i8* [[TMP24]], i8** [[TMP23]], align 4
+// CHECK14-NEXT: [[TMP25:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3
+// CHECK14-NEXT: [[TMP26:%.*]] = bitcast [1000 x i32]* [[TMP0]] to i8*
+// CHECK14-NEXT: store i8* [[TMP26]], i8** [[TMP25]], align 4
+// CHECK14-NEXT: [[TMP27:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 4
+// CHECK14-NEXT: [[TMP28:%.*]] = bitcast i32* [[TMP18]] to i8*
+// CHECK14-NEXT: store i8* [[TMP28]], i8** [[TMP27]], align 4
+// CHECK14-NEXT: [[TMP29:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK14-NEXT: [[TMP30:%.*]] = load i32, i32* [[TMP29]], align 4
+// CHECK14-NEXT: [[TMP31:%.*]] = bitcast [5 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK14-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP30]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, i32, [1000 x i32]*, i32*)* @__omp_outlined__11 to i8*), i8* null, i8** [[TMP31]], i32 5)
+// CHECK14-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK14: omp.inner.for.inc:
+// CHECK14-NEXT: [[TMP32:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK14-NEXT: [[TMP33:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK14-NEXT: [[ADD6:%.*]] = add nsw i32 [[TMP32]], [[TMP33]]
+// CHECK14-NEXT: store i32 [[ADD6]], i32* [[DOTOMP_IV]], align 4
+// CHECK14-NEXT: [[TMP34:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK14-NEXT: [[TMP35:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK14-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP34]], [[TMP35]]
+// CHECK14-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK14-NEXT: [[TMP36:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK14-NEXT: [[TMP37:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK14-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP36]], [[TMP37]]
+// CHECK14-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK14-NEXT: [[TMP38:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK14-NEXT: [[TMP39:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK14-NEXT: [[CMP9:%.*]] = icmp sgt i32 [[TMP38]], [[TMP39]]
+// CHECK14-NEXT: br i1 [[CMP9]], label [[COND_TRUE10:%.*]], label [[COND_FALSE11:%.*]]
+// CHECK14: cond.true10:
+// CHECK14-NEXT: [[TMP40:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK14-NEXT: br label [[COND_END12:%.*]]
+// CHECK14: cond.false11:
+// CHECK14-NEXT: [[TMP41:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK14-NEXT: br label [[COND_END12]]
+// CHECK14: cond.end12:
+// CHECK14-NEXT: [[COND13:%.*]] = phi i32 [ [[TMP40]], [[COND_TRUE10]] ], [ [[TMP41]], [[COND_FALSE11]] ]
+// CHECK14-NEXT: store i32 [[COND13]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK14-NEXT: [[TMP42:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK14-NEXT: store i32 [[TMP42]], i32* [[DOTOMP_IV]], align 4
+// CHECK14-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK14: omp.inner.for.end:
+// CHECK14-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK14: omp.loop.exit:
+// CHECK14-NEXT: [[TMP43:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK14-NEXT: [[TMP44:%.*]] = load i32, i32* [[TMP43]], align 4
+// CHECK14-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP44]])
+// CHECK14-NEXT: br label [[OMP_PRECOND_END]]
+// CHECK14: omp.precond.end:
+// CHECK14-NEXT: ret void
+//
+//
+// CHECK14-LABEL: define {{[^@]+}}@__omp_outlined__11
+// CHECK14-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32* [[V:%.*]]) #[[ATTR0]] {
+// CHECK14-NEXT: entry:
+// CHECK14-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK14-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK14-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4
+// CHECK14-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4
+// CHECK14-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4
+// CHECK14-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4
+// CHECK14-NEXT: [[V_ADDR:%.*]] = alloca i32*, align 4
+// CHECK14-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK14-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK14-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK14-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
+// CHECK14-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK14-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK14-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK14-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK14-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK14-NEXT: [[I3:%.*]] = alloca i32, align 4
+// CHECK14-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK14-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK14-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK14-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK14-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4
+// CHECK14-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4
+// CHECK14-NEXT: store i32* [[V]], i32** [[V_ADDR]], align 4
+// CHECK14-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4
+// CHECK14-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4
+// CHECK14-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK14-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK14-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0
+// CHECK14-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK14-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1
+// CHECK14-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK14-NEXT: store i32 0, i32* [[I]], align 4
+// CHECK14-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK14-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]]
+// CHECK14-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]]
+// CHECK14: omp.precond.then:
+// CHECK14-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK14-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK14-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_UB]], align 4
+// CHECK14-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK14-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK14-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_LB]], align 4
+// CHECK14-NEXT: store i32 [[TMP6]], i32* [[DOTOMP_UB]], align 4
+// CHECK14-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK14-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK14-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK14-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4
+// CHECK14-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP8]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK14-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK14-NEXT: store i32 [[TMP9]], i32* [[DOTOMP_IV]], align 4
+// CHECK14-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK14: omp.inner.for.cond:
+// CHECK14-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK14-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK14-NEXT: [[CMP4:%.*]] = icmp ule i32 [[TMP10]], [[TMP11]]
+// CHECK14-NEXT: br i1 [[CMP4]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK14: omp.inner.for.body:
+// CHECK14-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK14-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP12]], 1
+// CHECK14-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK14-NEXT: store i32 [[ADD]], i32* [[I3]], align 4
+// CHECK14-NEXT: [[TMP13:%.*]] = load i32*, i32** [[V_ADDR]], align 4
+// CHECK14-NEXT: [[TMP14:%.*]] = load i32, i32* [[I3]], align 4
+// CHECK14-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TMP13]], i32 [[TMP14]]
+// CHECK14-NEXT: [[TMP15:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+// CHECK14-NEXT: [[TMP16:%.*]] = load i32, i32* [[I3]], align 4
+// CHECK14-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds [1000 x i32], [1000 x i32]* [[TMP0]], i32 0, i32 [[TMP16]]
+// CHECK14-NEXT: store i32 [[TMP15]], i32* [[ARRAYIDX5]], align 4
+// CHECK14-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK14: omp.body.continue:
+// CHECK14-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK14: omp.inner.for.inc:
+// CHECK14-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK14-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK14-NEXT: [[ADD6:%.*]] = add nsw i32 [[TMP17]], [[TMP18]]
+// CHECK14-NEXT: store i32 [[ADD6]], i32* [[DOTOMP_IV]], align 4
+// CHECK14-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK14: omp.inner.for.end:
+// CHECK14-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK14: omp.loop.exit:
+// CHECK14-NEXT: [[TMP19:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK14-NEXT: [[TMP20:%.*]] = load i32, i32* [[TMP19]], align 4
+// CHECK14-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP20]])
+// CHECK14-NEXT: br label [[OMP_PRECOND_END]]
+// CHECK14: omp.precond.end:
+// CHECK14-NEXT: ret void
+//
+//
+// CHECK15-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l43
+// CHECK15-SAME: (i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32 [[L:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK15-NEXT: entry:
+// CHECK15-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4
+// CHECK15-NEXT: [[L_ADDR:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[L_CASTED:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK15-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4
+// CHECK15-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4
+// CHECK15-NEXT: store i32 [[L]], i32* [[L_ADDR]], align 4
+// CHECK15-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4
+// CHECK15-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK15-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0)
+// CHECK15-NEXT: br label [[DOTEXECUTE:%.*]]
+// CHECK15: .execute:
+// CHECK15-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3:[0-9]+]])
+// CHECK15-NEXT: [[TMP2:%.*]] = load i32, i32* [[N_ADDR]], align 4
+// CHECK15-NEXT: store i32 [[TMP2]], i32* [[N_CASTED]], align 4
+// CHECK15-NEXT: [[TMP3:%.*]] = load i32, i32* [[N_CASTED]], align 4
+// CHECK15-NEXT: [[TMP4:%.*]] = load i32, i32* [[L_ADDR]], align 4
+// CHECK15-NEXT: store i32 [[TMP4]], i32* [[L_CASTED]], align 4
+// CHECK15-NEXT: [[TMP5:%.*]] = load i32, i32* [[L_CASTED]], align 4
+// CHECK15-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK15-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32 [[TMP3]], [1000 x i32]* [[TMP0]], i32 [[TMP5]]) #[[ATTR2:[0-9]+]]
+// CHECK15-NEXT: br label [[DOTOMP_DEINIT:%.*]]
+// CHECK15: .omp.deinit:
+// CHECK15-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0)
+// CHECK15-NEXT: br label [[DOTEXIT:%.*]]
+// CHECK15: .exit:
+// CHECK15-NEXT: ret void
+//
+//
+// CHECK15-LABEL: define {{[^@]+}}@__omp_outlined__
+// CHECK15-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32 [[L:%.*]]) #[[ATTR0]] {
+// CHECK15-NEXT: entry:
+// CHECK15-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK15-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK15-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4
+// CHECK15-NEXT: [[L_ADDR:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[I4:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[L_CASTED:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [5 x i8*], align 4
+// CHECK15-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK15-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK15-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4
+// CHECK15-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4
+// CHECK15-NEXT: store i32 [[L]], i32* [[L_ADDR]], align 4
+// CHECK15-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4
+// CHECK15-NEXT: [[TMP1:%.*]] = call i8* @__kmpc_data_sharing_push_stack(i32 4, i16 1)
+// CHECK15-NEXT: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to %struct._globalized_locals_ty*
+// CHECK15-NEXT: [[L1:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP2]], i32 0, i32 0
+// CHECK15-NEXT: [[TMP3:%.*]] = load i32, i32* [[N_ADDR]], align 4
+// CHECK15-NEXT: store i32 [[TMP3]], i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK15-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK15-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP4]], 0
+// CHECK15-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK15-NEXT: [[SUB3:%.*]] = sub nsw i32 [[DIV]], 1
+// CHECK15-NEXT: store i32 [[SUB3]], i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK15-NEXT: store i32 0, i32* [[I]], align 4
+// CHECK15-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK15-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP5]]
+// CHECK15-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]]
+// CHECK15: omp.precond.then:
+// CHECK15-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK15-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK15-NEXT: store i32 [[TMP6]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK15-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK15-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK15-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK15-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4
+// CHECK15-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1:[0-9]+]], i32 [[TMP8]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 128)
+// CHECK15-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK15-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK15-NEXT: [[CMP5:%.*]] = icmp sgt i32 [[TMP9]], [[TMP10]]
+// CHECK15-NEXT: br i1 [[CMP5]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK15: cond.true:
+// CHECK15-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK15-NEXT: br label [[COND_END:%.*]]
+// CHECK15: cond.false:
+// CHECK15-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK15-NEXT: br label [[COND_END]]
+// CHECK15: cond.end:
+// CHECK15-NEXT: [[COND:%.*]] = phi i32 [ [[TMP11]], [[COND_TRUE]] ], [ [[TMP12]], [[COND_FALSE]] ]
+// CHECK15-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK15-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK15-NEXT: store i32 [[TMP13]], i32* [[DOTOMP_IV]], align 4
+// CHECK15-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK15: omp.inner.for.cond:
+// CHECK15-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK15-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK15-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP15]], 1
+// CHECK15-NEXT: [[CMP6:%.*]] = icmp slt i32 [[TMP14]], [[ADD]]
+// CHECK15-NEXT: br i1 [[CMP6]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK15: omp.inner.for.body:
+// CHECK15-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK15-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK15-NEXT: [[TMP18:%.*]] = load i32, i32* [[N_ADDR]], align 4
+// CHECK15-NEXT: store i32 [[TMP18]], i32* [[N_CASTED]], align 4
+// CHECK15-NEXT: [[TMP19:%.*]] = load i32, i32* [[N_CASTED]], align 4
+// CHECK15-NEXT: [[TMP20:%.*]] = load i32, i32* [[L_ADDR]], align 4
+// CHECK15-NEXT: store i32 [[TMP20]], i32* [[L_CASTED]], align 4
+// CHECK15-NEXT: [[TMP21:%.*]] = load i32, i32* [[L_CASTED]], align 4
+// CHECK15-NEXT: [[TMP22:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0
+// CHECK15-NEXT: [[TMP23:%.*]] = inttoptr i32 [[TMP16]] to i8*
+// CHECK15-NEXT: store i8* [[TMP23]], i8** [[TMP22]], align 4
+// CHECK15-NEXT: [[TMP24:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1
+// CHECK15-NEXT: [[TMP25:%.*]] = inttoptr i32 [[TMP17]] to i8*
+// CHECK15-NEXT: store i8* [[TMP25]], i8** [[TMP24]], align 4
+// CHECK15-NEXT: [[TMP26:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2
+// CHECK15-NEXT: [[TMP27:%.*]] = inttoptr i32 [[TMP19]] to i8*
+// CHECK15-NEXT: store i8* [[TMP27]], i8** [[TMP26]], align 4
+// CHECK15-NEXT: [[TMP28:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3
+// CHECK15-NEXT: [[TMP29:%.*]] = bitcast [1000 x i32]* [[TMP0]] to i8*
+// CHECK15-NEXT: store i8* [[TMP29]], i8** [[TMP28]], align 4
+// CHECK15-NEXT: [[TMP30:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 4
+// CHECK15-NEXT: [[TMP31:%.*]] = inttoptr i32 [[TMP21]] to i8*
+// CHECK15-NEXT: store i8* [[TMP31]], i8** [[TMP30]], align 4
+// CHECK15-NEXT: [[TMP32:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK15-NEXT: [[TMP33:%.*]] = load i32, i32* [[TMP32]], align 4
+// CHECK15-NEXT: [[TMP34:%.*]] = bitcast [5 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK15-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP33]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, i32, [1000 x i32]*, i32)* @__omp_outlined__1 to i8*), i8* null, i8** [[TMP34]], i32 5)
+// CHECK15-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK15: omp.inner.for.inc:
+// CHECK15-NEXT: [[TMP35:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK15-NEXT: [[TMP36:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK15-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP35]], [[TMP36]]
+// CHECK15-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_IV]], align 4
+// CHECK15-NEXT: [[TMP37:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK15-NEXT: [[TMP38:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK15-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP37]], [[TMP38]]
+// CHECK15-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK15-NEXT: [[TMP39:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK15-NEXT: [[TMP40:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK15-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP39]], [[TMP40]]
+// CHECK15-NEXT: store i32 [[ADD9]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK15-NEXT: [[TMP41:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK15-NEXT: [[TMP42:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK15-NEXT: [[CMP10:%.*]] = icmp sgt i32 [[TMP41]], [[TMP42]]
+// CHECK15-NEXT: br i1 [[CMP10]], label [[COND_TRUE11:%.*]], label [[COND_FALSE12:%.*]]
+// CHECK15: cond.true11:
+// CHECK15-NEXT: [[TMP43:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK15-NEXT: br label [[COND_END13:%.*]]
+// CHECK15: cond.false12:
+// CHECK15-NEXT: [[TMP44:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK15-NEXT: br label [[COND_END13]]
+// CHECK15: cond.end13:
+// CHECK15-NEXT: [[COND14:%.*]] = phi i32 [ [[TMP43]], [[COND_TRUE11]] ], [ [[TMP44]], [[COND_FALSE12]] ]
+// CHECK15-NEXT: store i32 [[COND14]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK15-NEXT: [[TMP45:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK15-NEXT: store i32 [[TMP45]], i32* [[DOTOMP_IV]], align 4
+// CHECK15-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK15: omp.inner.for.end:
+// CHECK15-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK15: omp.loop.exit:
+// CHECK15-NEXT: [[TMP46:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK15-NEXT: [[TMP47:%.*]] = load i32, i32* [[TMP46]], align 4
+// CHECK15-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP47]])
+// CHECK15-NEXT: [[TMP48:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK15-NEXT: [[TMP49:%.*]] = icmp ne i32 [[TMP48]], 0
+// CHECK15-NEXT: br i1 [[TMP49]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]]
+// CHECK15: .omp.lastprivate.then:
+// CHECK15-NEXT: [[TMP50:%.*]] = load i32, i32* [[L_ADDR]], align 4
+// CHECK15-NEXT: store i32 [[TMP50]], i32* [[L_ADDR]], align 4
+// CHECK15-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]]
+// CHECK15: .omp.lastprivate.done:
+// CHECK15-NEXT: br label [[OMP_PRECOND_END]]
+// CHECK15: omp.precond.end:
+// CHECK15-NEXT: call void @__kmpc_data_sharing_pop_stack(i8* [[TMP1]])
+// CHECK15-NEXT: ret void
+//
+//
+// CHECK15-LABEL: define {{[^@]+}}@__omp_outlined__1
+// CHECK15-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32 [[L:%.*]]) #[[ATTR0]] {
+// CHECK15-NEXT: entry:
+// CHECK15-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK15-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK15-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4
+// CHECK15-NEXT: [[L_ADDR:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[I3:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK15-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK15-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK15-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK15-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4
+// CHECK15-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4
+// CHECK15-NEXT: store i32 [[L]], i32* [[L_ADDR]], align 4
+// CHECK15-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4
+// CHECK15-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4
+// CHECK15-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK15-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK15-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0
+// CHECK15-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK15-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1
+// CHECK15-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK15-NEXT: store i32 0, i32* [[I]], align 4
+// CHECK15-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK15-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]]
+// CHECK15-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]]
+// CHECK15: omp.precond.then:
+// CHECK15-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK15-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK15-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_UB]], align 4
+// CHECK15-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK15-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK15-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_LB]], align 4
+// CHECK15-NEXT: store i32 [[TMP6]], i32* [[DOTOMP_UB]], align 4
+// CHECK15-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK15-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK15-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK15-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4
+// CHECK15-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP8]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 32)
+// CHECK15-NEXT: br label [[OMP_DISPATCH_COND:%.*]]
+// CHECK15: omp.dispatch.cond:
+// CHECK15-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK15-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK15-NEXT: [[CMP4:%.*]] = icmp ugt i32 [[TMP9]], [[TMP10]]
+// CHECK15-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK15: cond.true:
+// CHECK15-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK15-NEXT: br label [[COND_END:%.*]]
+// CHECK15: cond.false:
+// CHECK15-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK15-NEXT: br label [[COND_END]]
+// CHECK15: cond.end:
+// CHECK15-NEXT: [[COND:%.*]] = phi i32 [ [[TMP11]], [[COND_TRUE]] ], [ [[TMP12]], [[COND_FALSE]] ]
+// CHECK15-NEXT: store i32 [[COND]], i32* [[DOTOMP_UB]], align 4
+// CHECK15-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK15-NEXT: store i32 [[TMP13]], i32* [[DOTOMP_IV]], align 4
+// CHECK15-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK15-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK15-NEXT: [[CMP5:%.*]] = icmp sle i32 [[TMP14]], [[TMP15]]
+// CHECK15-NEXT: br i1 [[CMP5]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]]
+// CHECK15: omp.dispatch.body:
+// CHECK15-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK15: omp.inner.for.cond:
+// CHECK15-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK15-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK15-NEXT: [[CMP6:%.*]] = icmp sle i32 [[TMP16]], [[TMP17]]
+// CHECK15-NEXT: br i1 [[CMP6]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK15: omp.inner.for.body:
+// CHECK15-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK15-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP18]], 1
+// CHECK15-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK15-NEXT: store i32 [[ADD]], i32* [[I3]], align 4
+// CHECK15-NEXT: [[TMP19:%.*]] = load i32, i32* [[I3]], align 4
+// CHECK15-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1000 x i32], [1000 x i32]* [[TMP0]], i32 0, i32 [[TMP19]]
+// CHECK15-NEXT: store i32 1, i32* [[ARRAYIDX]], align 4
+// CHECK15-NEXT: [[TMP20:%.*]] = load i32, i32* [[I3]], align 4
+// CHECK15-NEXT: store i32 [[TMP20]], i32* [[L_ADDR]], align 4
+// CHECK15-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK15: omp.body.continue:
+// CHECK15-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK15: omp.inner.for.inc:
+// CHECK15-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK15-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP21]], 1
+// CHECK15-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_IV]], align 4
+// CHECK15-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK15: omp.inner.for.end:
+// CHECK15-NEXT: br label [[OMP_DISPATCH_INC:%.*]]
+// CHECK15: omp.dispatch.inc:
+// CHECK15-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK15-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK15-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP22]], [[TMP23]]
+// CHECK15-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_LB]], align 4
+// CHECK15-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK15-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK15-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP24]], [[TMP25]]
+// CHECK15-NEXT: store i32 [[ADD9]], i32* [[DOTOMP_UB]], align 4
+// CHECK15-NEXT: br label [[OMP_DISPATCH_COND]]
+// CHECK15: omp.dispatch.end:
+// CHECK15-NEXT: [[TMP26:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK15-NEXT: [[TMP27:%.*]] = load i32, i32* [[TMP26]], align 4
+// CHECK15-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP27]])
+// CHECK15-NEXT: [[TMP28:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK15-NEXT: [[TMP29:%.*]] = icmp ne i32 [[TMP28]],
0 +// CHECK15-NEXT: br i1 [[TMP29]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]] +// CHECK15: .omp.lastprivate.then: +// CHECK15-NEXT: [[TMP30:%.*]] = load i32, i32* [[L_ADDR]], align 4 +// CHECK15-NEXT: store i32 [[TMP30]], i32* [[L_ADDR]], align 4 +// CHECK15-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]] +// CHECK15: .omp.lastprivate.done: +// CHECK15-NEXT: br label [[OMP_PRECOND_END]] +// CHECK15: omp.precond.end: +// CHECK15-NEXT: ret void +// +// +// CHECK15-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l49 +// CHECK15-SAME: (i32 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { +// CHECK15-NEXT: entry: +// CHECK15-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 4 +// CHECK15-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK15-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK15-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 4 +// CHECK15-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 4 +// CHECK15-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK15-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) +// CHECK15-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK15: .execute: +// CHECK15-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) +// CHECK15-NEXT: [[TMP2:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK15-NEXT: store i32 [[TMP2]], i32* [[N_CASTED]], align 4 +// CHECK15-NEXT: [[TMP3:%.*]] = load i32, i32* [[N_CASTED]], align 4 +// CHECK15-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK15-NEXT: call void @__omp_outlined__2(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32 [[TMP3]], [1000 x i16]* [[TMP0]]) #[[ATTR2]] +// CHECK15-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK15: .omp.deinit: +// CHECK15-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) +// CHECK15-NEXT: br label [[DOTEXIT:%.*]] +// CHECK15: .exit: +// CHECK15-NEXT: ret void +// +// +// CHECK15-LABEL: define {{[^@]+}}@__omp_outlined__2 +// CHECK15-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { +// CHECK15-NEXT: entry: +// CHECK15-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK15-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK15-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 4 +// CHECK15-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: [[I3:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x 
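(Orientation note, not part of the generated checks: reconstructed from the CHECK patterns alone, the parallel worker verified above plausibly comes from a construct of the following shape; the clause list and variable names are assumptions, not quotes from the test source. The i32 33 passed to @__kmpc_for_static_init_4 is kmp_sch_static_chunked, with the chunk of 32 as the trailing argument, and the .omp.lastprivate.* blocks implement the copy-back of l.)

    // Hypothetical source sketch for the checks above:
    #pragma omp target teams distribute parallel for lastprivate(l) schedule(static, 32)
    for (int i = 0; i < n; i++) {
      a[i] = 1; // the 'store i32 1' through [[ARRAYIDX]]
      l = i;    // feeds the lastprivate copy-back on the last chunk
    }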
+// CHECK15-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l49
+// CHECK15-SAME: (i32 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] {
+// CHECK15-NEXT: entry:
+// CHECK15-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 4
+// CHECK15-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK15-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4
+// CHECK15-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 4
+// CHECK15-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 4
+// CHECK15-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK15-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0)
+// CHECK15-NEXT: br label [[DOTEXECUTE:%.*]]
+// CHECK15: .execute:
+// CHECK15-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]])
+// CHECK15-NEXT: [[TMP2:%.*]] = load i32, i32* [[N_ADDR]], align 4
+// CHECK15-NEXT: store i32 [[TMP2]], i32* [[N_CASTED]], align 4
+// CHECK15-NEXT: [[TMP3:%.*]] = load i32, i32* [[N_CASTED]], align 4
+// CHECK15-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK15-NEXT: call void @__omp_outlined__2(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32 [[TMP3]], [1000 x i16]* [[TMP0]]) #[[ATTR2]]
+// CHECK15-NEXT: br label [[DOTOMP_DEINIT:%.*]]
+// CHECK15: .omp.deinit:
+// CHECK15-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0)
+// CHECK15-NEXT: br label [[DOTEXIT:%.*]]
+// CHECK15: .exit:
+// CHECK15-NEXT: ret void
+//
+//
+// CHECK15-LABEL: define {{[^@]+}}@__omp_outlined__2
+// CHECK15-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] {
+// CHECK15-NEXT: entry:
+// CHECK15-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK15-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK15-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 4
+// CHECK15-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[I3:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x i8*], align 4
+// CHECK15-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK15-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK15-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4
+// CHECK15-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 4
+// CHECK15-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 4
+// CHECK15-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4
+// CHECK15-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK15-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK15-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0
+// CHECK15-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK15-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1
+// CHECK15-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK15-NEXT: store i32 0, i32* [[I]], align 4
+// CHECK15-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK15-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]]
+// CHECK15-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]]
+// CHECK15: omp.precond.then:
+// CHECK15-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK15-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK15-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK15-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK15-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK15-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK15-NEXT: [[TMP5:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK15-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4
+// CHECK15-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP6]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]])
+// CHECK15-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK15-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK15-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP7]], [[TMP8]]
+// CHECK15-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK15: cond.true:
+// CHECK15-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK15-NEXT: br label [[COND_END:%.*]]
+// CHECK15: cond.false:
+// CHECK15-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK15-NEXT: br label [[COND_END]]
+// CHECK15: cond.end:
+// CHECK15-NEXT: [[COND:%.*]] = phi i32 [ [[TMP9]], [[COND_TRUE]] ], [ [[TMP10]], [[COND_FALSE]] ]
+// CHECK15-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK15-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK15-NEXT: store i32 [[TMP11]], i32* [[DOTOMP_IV]], align 4
+// CHECK15-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK15: omp.inner.for.cond:
+// CHECK15-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK15-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK15-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP13]], 1
+// CHECK15-NEXT: [[CMP5:%.*]] = icmp slt i32 [[TMP12]], [[ADD]]
+// CHECK15-NEXT: br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK15: omp.inner.for.body:
+// CHECK15-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK15-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK15-NEXT: [[TMP16:%.*]] = load i32, i32* [[N_ADDR]], align 4
+// CHECK15-NEXT: store i32 [[TMP16]], i32* [[N_CASTED]], align 4
+// CHECK15-NEXT: [[TMP17:%.*]] = load i32, i32* [[N_CASTED]], align 4
+// CHECK15-NEXT: [[TMP18:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0
+// CHECK15-NEXT: [[TMP19:%.*]] = inttoptr i32 [[TMP14]] to i8*
+// CHECK15-NEXT: store i8* [[TMP19]], i8** [[TMP18]], align 4
+// CHECK15-NEXT: [[TMP20:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1
+// CHECK15-NEXT: [[TMP21:%.*]] = inttoptr i32 [[TMP15]] to i8*
+// CHECK15-NEXT: store i8* [[TMP21]], i8** [[TMP20]], align 4
+// CHECK15-NEXT: [[TMP22:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2
+// CHECK15-NEXT: [[TMP23:%.*]] = inttoptr i32 [[TMP17]] to i8*
+// CHECK15-NEXT: store i8* [[TMP23]], i8** [[TMP22]], align 4
+// CHECK15-NEXT: [[TMP24:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3
+// CHECK15-NEXT: [[TMP25:%.*]] = bitcast [1000 x i16]* [[TMP0]] to i8*
+// CHECK15-NEXT: store i8* [[TMP25]], i8** [[TMP24]], align 4
+// CHECK15-NEXT: [[TMP26:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK15-NEXT: [[TMP27:%.*]] = load i32, i32* [[TMP26]], align 4
+// CHECK15-NEXT: [[TMP28:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK15-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP27]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, i32, [1000 x i16]*)* @__omp_outlined__3 to i8*), i8* null, i8** [[TMP28]], i32 4)
+// CHECK15-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK15: omp.inner.for.inc:
+// CHECK15-NEXT: [[TMP29:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK15-NEXT: [[TMP30:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK15-NEXT: [[ADD6:%.*]] = add nsw i32 [[TMP29]], [[TMP30]]
+// CHECK15-NEXT: store i32 [[ADD6]], i32* [[DOTOMP_IV]], align 4
+// CHECK15-NEXT: [[TMP31:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK15-NEXT: [[TMP32:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK15-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP31]], [[TMP32]]
+// CHECK15-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK15-NEXT: [[TMP33:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK15-NEXT: [[TMP34:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK15-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP33]], [[TMP34]]
+// CHECK15-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK15-NEXT: [[TMP35:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK15-NEXT: [[TMP36:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK15-NEXT: [[CMP9:%.*]] = icmp sgt i32 [[TMP35]], [[TMP36]]
+// CHECK15-NEXT: br i1 [[CMP9]], label [[COND_TRUE10:%.*]], label [[COND_FALSE11:%.*]]
+// CHECK15: cond.true10:
+// CHECK15-NEXT: [[TMP37:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK15-NEXT: br label [[COND_END12:%.*]]
+// CHECK15: cond.false11:
+// CHECK15-NEXT: [[TMP38:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK15-NEXT: br label [[COND_END12]]
+// CHECK15: cond.end12:
+// CHECK15-NEXT: [[COND13:%.*]] = phi i32 [ [[TMP37]], [[COND_TRUE10]] ], [ [[TMP38]], [[COND_FALSE11]] ]
+// CHECK15-NEXT: store i32 [[COND13]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK15-NEXT: [[TMP39:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK15-NEXT: store i32 [[TMP39]], i32* [[DOTOMP_IV]], align 4
+// CHECK15-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK15: omp.inner.for.end:
+// CHECK15-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK15: omp.loop.exit:
+// CHECK15-NEXT: [[TMP40:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK15-NEXT: [[TMP41:%.*]] = load i32, i32* [[TMP40]], align 4
+// CHECK15-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP41]])
+// CHECK15-NEXT: br label [[OMP_PRECOND_END]]
+// CHECK15: omp.precond.end:
+// CHECK15-NEXT: ret void
+//
+//
+// CHECK15-LABEL: define {{[^@]+}}@__omp_outlined__3
+// CHECK15-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] {
+// CHECK15-NEXT: entry:
+// CHECK15-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK15-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK15-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 4
+// CHECK15-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[I3:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK15-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK15-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK15-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK15-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4
+// CHECK15-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 4
+// CHECK15-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 4
+// CHECK15-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4
+// CHECK15-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK15-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK15-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0
+// CHECK15-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK15-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1
+// CHECK15-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK15-NEXT: store i32 0, i32* [[I]], align 4
+// CHECK15-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK15-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]]
+// CHECK15-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]]
+// CHECK15: omp.precond.then:
+// CHECK15-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK15-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK15-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_UB]], align 4
+// CHECK15-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK15-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK15-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_LB]], align 4
+// CHECK15-NEXT: store i32 [[TMP6]], i32* [[DOTOMP_UB]], align 4
+// CHECK15-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK15-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK15-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK15-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4
+// CHECK15-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP8]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK15-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK15-NEXT: store i32 [[TMP9]], i32* [[DOTOMP_IV]], align 4
+// CHECK15-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK15: omp.inner.for.cond:
+// CHECK15-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK15-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK15-NEXT: [[CMP4:%.*]] = icmp ule i32 [[TMP10]], [[TMP11]]
+// CHECK15-NEXT: br i1 [[CMP4]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK15: omp.inner.for.body:
+// CHECK15-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK15-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP12]], 1
+// CHECK15-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK15-NEXT: store i32 [[ADD]], i32* [[I3]], align 4
+// CHECK15-NEXT: [[TMP13:%.*]] = load i32, i32* [[I3]], align 4
+// CHECK15-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1000 x i16], [1000 x i16]* [[TMP0]], i32 0, i32 [[TMP13]]
+// CHECK15-NEXT: [[TMP14:%.*]] = load i16, i16* [[ARRAYIDX]], align 2
+// CHECK15-NEXT: [[CONV:%.*]] = sext i16 [[TMP14]] to i32
+// CHECK15-NEXT: [[ADD5:%.*]] = add nsw i32 [[CONV]], 1
+// CHECK15-NEXT: [[CONV6:%.*]] = trunc i32 [[ADD5]] to i16
+// CHECK15-NEXT: store i16 [[CONV6]], i16* [[ARRAYIDX]], align 2
+// CHECK15-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK15: omp.body.continue:
+// CHECK15-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK15: omp.inner.for.inc:
+// CHECK15-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK15-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK15-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP15]], [[TMP16]]
+// CHECK15-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_IV]], align 4
+// CHECK15-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK15: omp.inner.for.end:
+// CHECK15-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK15: omp.loop.exit:
+// CHECK15-NEXT: [[TMP17:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK15-NEXT: [[TMP18:%.*]] = load i32, i32* [[TMP17]], align 4
+// CHECK15-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP18]])
+// CHECK15-NEXT: br label [[OMP_PRECOND_END]]
+// CHECK15: omp.precond.end:
+// CHECK15-NEXT: ret void
+//
+//
+// CHECK15-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l54
+// CHECK15-SAME: ([10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] {
+// CHECK15-NEXT: entry:
+// CHECK15-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4
+// CHECK15-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK15-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4
+// CHECK15-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4
+// CHECK15-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK15-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0)
+// CHECK15-NEXT: br label [[DOTEXECUTE:%.*]]
+// CHECK15: .execute:
+// CHECK15-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]])
+// CHECK15-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK15-NEXT: call void @__omp_outlined__4(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x i32]* [[TMP0]]) #[[ATTR2]]
+// CHECK15-NEXT: br label [[DOTOMP_DEINIT:%.*]]
+// CHECK15: .omp.deinit:
+// CHECK15-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0)
+// CHECK15-NEXT: br label [[DOTEXIT:%.*]]
+// CHECK15: .exit:
+// CHECK15-NEXT: ret void
+//
+//
+// CHECK15-LABEL: define {{[^@]+}}@__omp_outlined__4
+// CHECK15-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] {
+// CHECK15-NEXT: entry:
+// CHECK15-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK15-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK15-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4
+// CHECK15-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [3 x i8*], align 4
+// CHECK15-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK15-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK15-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4
+// CHECK15-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4
+// CHECK15-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK15-NEXT: store i32 9, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK15-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK15-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK15-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK15-NEXT: [[TMP1:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK15-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4
+// CHECK15-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]])
+// CHECK15-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK15-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP3]], 9
+// CHECK15-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK15: cond.true:
+// CHECK15-NEXT: br label [[COND_END:%.*]]
+// CHECK15: cond.false:
+// CHECK15-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK15-NEXT: br label [[COND_END]]
+// CHECK15: cond.end:
+// CHECK15-NEXT: [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP4]], [[COND_FALSE]] ]
+// CHECK15-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK15-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK15-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4
+// CHECK15-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK15: omp.inner.for.cond:
+// CHECK15-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK15-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP6]], 10
+// CHECK15-NEXT: br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK15: omp.inner.for.body:
+// CHECK15-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK15-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK15-NEXT: [[TMP9:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0
+// CHECK15-NEXT: [[TMP10:%.*]] = inttoptr i32 [[TMP7]] to i8*
+// CHECK15-NEXT: store i8* [[TMP10]], i8** [[TMP9]], align 4
+// CHECK15-NEXT: [[TMP11:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1
+// CHECK15-NEXT: [[TMP12:%.*]] = inttoptr i32 [[TMP8]] to i8*
+// CHECK15-NEXT: store i8* [[TMP12]], i8** [[TMP11]], align 4
+// CHECK15-NEXT: [[TMP13:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2
+// CHECK15-NEXT: [[TMP14:%.*]] = bitcast [10 x i32]* [[TMP0]] to i8*
+// CHECK15-NEXT: store i8* [[TMP14]], i8** [[TMP13]], align 4
+// CHECK15-NEXT: [[TMP15:%.*]] = bitcast [3 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK15-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, [10 x i32]*)* @__omp_outlined__5 to i8*), i8* null, i8** [[TMP15]], i32 3)
+// CHECK15-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK15: omp.inner.for.inc:
+// CHECK15-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK15-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK15-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP16]], [[TMP17]]
+// CHECK15-NEXT: store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4
+// CHECK15-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK15-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK15-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP18]], [[TMP19]]
+// CHECK15-NEXT: store i32 [[ADD2]], i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK15-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK15-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK15-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP20]], [[TMP21]]
+// CHECK15-NEXT: store i32 [[ADD3]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK15-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK15-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP22]], 9
+// CHECK15-NEXT: br i1 [[CMP4]], label [[COND_TRUE5:%.*]], label [[COND_FALSE6:%.*]]
+// CHECK15: cond.true5:
+// CHECK15-NEXT: br label [[COND_END7:%.*]]
+// CHECK15: cond.false6:
+// CHECK15-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK15-NEXT: br label [[COND_END7]]
+// CHECK15: cond.end7:
+// CHECK15-NEXT: [[COND8:%.*]] = phi i32 [ 9, [[COND_TRUE5]] ], [ [[TMP23]], [[COND_FALSE6]] ]
+// CHECK15-NEXT: store i32 [[COND8]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK15-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK15-NEXT: store i32 [[TMP24]], i32* [[DOTOMP_IV]], align 4
+// CHECK15-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK15: omp.inner.for.end:
+// CHECK15-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK15: omp.loop.exit:
+// CHECK15-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]])
+// CHECK15-NEXT: ret void
+//
+//
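(Orientation note, an assumption rather than test content: the l54 kernel above distributes a constant ten-trip loop, which is why the distribute bounds are the literals 0 and 9 and no omp.precond check is emitted. A source shape consistent with these checks would be:)

    // Sketch only; any map/if clauses on the real source line 54 are not visible here.
    #pragma omp target teams distribute parallel for
    for (int i = 0; i < 10; i++)
      b[i] += 1; // matches [[ADD1]] = add nsw i32 [[TMP10]], 1 in __omp_outlined__5

(__omp_outlined__4 carves 0..9 into chunks via @__kmpc_for_static_init_4 with schedule 91, kmp_distribute_static_chunked, and hands each chunk's bounds to __omp_outlined__5 through the captured_vars_addrs array consumed by @__kmpc_parallel_51.)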
+// CHECK15-LABEL: define {{[^@]+}}@__omp_outlined__5
+// CHECK15-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] {
+// CHECK15-NEXT: entry:
+// CHECK15-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK15-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK15-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4
+// CHECK15-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK15-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK15-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK15-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK15-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4
+// CHECK15-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4
+// CHECK15-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK15-NEXT: store i32 9, i32* [[DOTOMP_UB]], align 4
+// CHECK15-NEXT: [[TMP1:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK15-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK15-NEXT: store i32 [[TMP1]], i32* [[DOTOMP_LB]], align 4
+// CHECK15-NEXT: store i32 [[TMP2]], i32* [[DOTOMP_UB]], align 4
+// CHECK15-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK15-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK15-NEXT: [[TMP3:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK15-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4
+// CHECK15-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP4]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK15-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK15-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4
+// CHECK15-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK15: omp.inner.for.cond:
+// CHECK15-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK15-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK15-NEXT: [[CMP:%.*]] = icmp ule i32 [[TMP6]], [[TMP7]]
+// CHECK15-NEXT: br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK15: omp.inner.for.body:
+// CHECK15-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK15-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP8]], 1
+// CHECK15-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK15-NEXT: store i32 [[ADD]], i32* [[I]], align 4
+// CHECK15-NEXT: [[TMP9:%.*]] = load i32, i32* [[I]], align 4
+// CHECK15-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP0]], i32 0, i32 [[TMP9]]
+// CHECK15-NEXT: [[TMP10:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+// CHECK15-NEXT: [[ADD1:%.*]] = add nsw i32 [[TMP10]], 1
+// CHECK15-NEXT: store i32 [[ADD1]], i32* [[ARRAYIDX]], align 4
+// CHECK15-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK15: omp.body.continue:
+// CHECK15-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK15: omp.inner.for.inc:
+// CHECK15-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK15-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK15-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP11]], [[TMP12]]
+// CHECK15-NEXT: store i32 [[ADD2]], i32* [[DOTOMP_IV]], align 4
+// CHECK15-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK15: omp.inner.for.end:
+// CHECK15-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK15: omp.loop.exit:
+// CHECK15-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP4]])
+// CHECK15-NEXT: ret void
+//
+//
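(The next kernel, l59, covers a collapsed 10 x 10 nest: the combined trip count is the constant 100, hence the 0..99 bounds and the i32 induction variable. f travels by value through [[F_CASTED]] and k is a plain private temporary. A consistent source shape, clauses assumed:)

    // Hypothetical reconstruction of the l59 region:
    #pragma omp target teams distribute parallel for collapse(2) firstprivate(f)
    for (int i = 0; i < 10; ++i)
      for (int j = 0; j < 10; ++j) {
        k = 10;                  // the 'store i32 10' into [[K]]
        c[i][j] = i + j * f + k; // [[MUL6]], [[ADD7]], [[ADD8]] in __omp_outlined__7
      }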
+// CHECK15-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l59
+// CHECK15-SAME: ([10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i32 [[F:%.*]]) #[[ATTR0]] {
+// CHECK15-NEXT: entry:
+// CHECK15-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4
+// CHECK15-NEXT: [[F_ADDR:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[F_CASTED:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK15-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4
+// CHECK15-NEXT: store i32 [[F]], i32* [[F_ADDR]], align 4
+// CHECK15-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4
+// CHECK15-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK15-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0)
+// CHECK15-NEXT: br label [[DOTEXECUTE:%.*]]
+// CHECK15: .execute:
+// CHECK15-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]])
+// CHECK15-NEXT: [[TMP2:%.*]] = load i32, i32* [[F_ADDR]], align 4
+// CHECK15-NEXT: store i32 [[TMP2]], i32* [[F_CASTED]], align 4
+// CHECK15-NEXT: [[TMP3:%.*]] = load i32, i32* [[F_CASTED]], align 4
+// CHECK15-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK15-NEXT: call void @__omp_outlined__6(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x [10 x i32]]* [[TMP0]], i32 [[TMP3]]) #[[ATTR2]]
+// CHECK15-NEXT: br label [[DOTOMP_DEINIT:%.*]]
+// CHECK15: .omp.deinit:
+// CHECK15-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0)
+// CHECK15-NEXT: br label [[DOTEXIT:%.*]]
+// CHECK15: .exit:
+// CHECK15-NEXT: ret void
+//
+//
+// CHECK15-LABEL: define {{[^@]+}}@__omp_outlined__6
+// CHECK15-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i32 [[F:%.*]]) #[[ATTR0]] {
+// CHECK15-NEXT: entry:
+// CHECK15-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK15-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK15-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4
+// CHECK15-NEXT: [[F_ADDR:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[_TMP1:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[K:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[J:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[F_CASTED:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x i8*], align 4
+// CHECK15-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK15-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK15-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4
+// CHECK15-NEXT: store i32 [[F]], i32* [[F_ADDR]], align 4
+// CHECK15-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4
+// CHECK15-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK15-NEXT: store i32 99, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK15-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK15-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK15-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK15-NEXT: [[TMP1:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK15-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4
+// CHECK15-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]])
+// CHECK15-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK15-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP3]], 99
+// CHECK15-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK15: cond.true:
+// CHECK15-NEXT: br label [[COND_END:%.*]]
+// CHECK15: cond.false:
+// CHECK15-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK15-NEXT: br label [[COND_END]]
+// CHECK15: cond.end:
+// CHECK15-NEXT: [[COND:%.*]] = phi i32 [ 99, [[COND_TRUE]] ], [ [[TMP4]], [[COND_FALSE]] ]
+// CHECK15-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK15-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK15-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4
+// CHECK15-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK15: omp.inner.for.cond:
+// CHECK15-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK15-NEXT: [[CMP2:%.*]] = icmp slt i32 [[TMP6]], 100
+// CHECK15-NEXT: br i1 [[CMP2]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK15: omp.inner.for.body:
+// CHECK15-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK15-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK15-NEXT: [[TMP9:%.*]] = load i32, i32* [[F_ADDR]], align 4
+// CHECK15-NEXT: store i32 [[TMP9]], i32* [[F_CASTED]], align 4
+// CHECK15-NEXT: [[TMP10:%.*]] = load i32, i32* [[F_CASTED]], align 4
+// CHECK15-NEXT: [[TMP11:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0
+// CHECK15-NEXT: [[TMP12:%.*]] = inttoptr i32 [[TMP7]] to i8*
+// CHECK15-NEXT: store i8* [[TMP12]], i8** [[TMP11]], align 4
+// CHECK15-NEXT: [[TMP13:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1
+// CHECK15-NEXT: [[TMP14:%.*]] = inttoptr i32 [[TMP8]] to i8*
+// CHECK15-NEXT: store i8* [[TMP14]], i8** [[TMP13]], align 4
+// CHECK15-NEXT: [[TMP15:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2
+// CHECK15-NEXT: [[TMP16:%.*]] = bitcast [10 x [10 x i32]]* [[TMP0]] to i8*
+// CHECK15-NEXT: store i8* [[TMP16]], i8** [[TMP15]], align 4
+// CHECK15-NEXT: [[TMP17:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3
+// CHECK15-NEXT: [[TMP18:%.*]] = inttoptr i32 [[TMP10]] to i8*
+// CHECK15-NEXT: store i8* [[TMP18]], i8** [[TMP17]], align 4
+// CHECK15-NEXT: [[TMP19:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK15-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, [10 x [10 x i32]]*, i32)* @__omp_outlined__7 to i8*), i8* null, i8** [[TMP19]], i32 4)
+// CHECK15-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK15: omp.inner.for.inc:
+// CHECK15-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK15-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK15-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP20]], [[TMP21]]
+// CHECK15-NEXT: store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4
+// CHECK15-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK15-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK15-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP22]], [[TMP23]]
+// CHECK15-NEXT: store i32 [[ADD3]], i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK15-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK15-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK15-NEXT: [[ADD4:%.*]] = add nsw i32 [[TMP24]], [[TMP25]]
+// CHECK15-NEXT: store i32 [[ADD4]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK15-NEXT: [[TMP26:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK15-NEXT: [[CMP5:%.*]] = icmp sgt i32 [[TMP26]], 99
+// CHECK15-NEXT: br i1 [[CMP5]], label [[COND_TRUE6:%.*]], label [[COND_FALSE7:%.*]]
+// CHECK15: cond.true6:
+// CHECK15-NEXT: br label [[COND_END8:%.*]]
+// CHECK15: cond.false7:
+// CHECK15-NEXT: [[TMP27:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK15-NEXT: br label [[COND_END8]]
+// CHECK15: cond.end8:
+// CHECK15-NEXT: [[COND9:%.*]] = phi i32 [ 99, [[COND_TRUE6]] ], [ [[TMP27]], [[COND_FALSE7]] ]
+// CHECK15-NEXT: store i32 [[COND9]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK15-NEXT: [[TMP28:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK15-NEXT: store i32 [[TMP28]], i32* [[DOTOMP_IV]], align 4
+// CHECK15-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK15: omp.inner.for.end:
+// CHECK15-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK15: omp.loop.exit:
+// CHECK15-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]])
+// CHECK15-NEXT: ret void
+//
+//
+// CHECK15-LABEL: define {{[^@]+}}@__omp_outlined__7
+// CHECK15-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i32 [[F:%.*]]) #[[ATTR0]] {
+// CHECK15-NEXT: entry:
+// CHECK15-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK15-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK15-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4
+// CHECK15-NEXT: [[F_ADDR:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[_TMP1:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[K:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[J:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK15-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK15-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK15-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK15-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4
+// CHECK15-NEXT: store i32 [[F]], i32* [[F_ADDR]], align 4
+// CHECK15-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4
+// CHECK15-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK15-NEXT: store i32 99, i32* [[DOTOMP_UB]], align 4
+// CHECK15-NEXT: [[TMP1:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK15-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK15-NEXT: store i32 [[TMP1]], i32* [[DOTOMP_LB]], align 4
+// CHECK15-NEXT: store i32 [[TMP2]], i32* [[DOTOMP_UB]], align 4
+// CHECK15-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK15-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK15-NEXT: [[TMP3:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK15-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4
+// CHECK15-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP4]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK15-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK15-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4
+// CHECK15-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK15: omp.inner.for.cond:
+// CHECK15-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK15-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK15-NEXT: [[CMP:%.*]] = icmp ule i32 [[TMP6]], [[TMP7]]
+// CHECK15-NEXT: br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK15: omp.inner.for.body:
+// CHECK15-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK15-NEXT: [[DIV:%.*]] = sdiv i32 [[TMP8]], 10
+// CHECK15-NEXT: [[MUL:%.*]] = mul nsw i32 [[DIV]], 1
+// CHECK15-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK15-NEXT: store i32 [[ADD]], i32* [[I]], align 4
+// CHECK15-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK15-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK15-NEXT: [[DIV2:%.*]] = sdiv i32 [[TMP10]], 10
+// CHECK15-NEXT: [[MUL3:%.*]] = mul nsw i32 [[DIV2]], 10
+// CHECK15-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP9]], [[MUL3]]
+// CHECK15-NEXT: [[MUL4:%.*]] = mul nsw i32 [[SUB]], 1
+// CHECK15-NEXT: [[ADD5:%.*]] = add nsw i32 0, [[MUL4]]
+// CHECK15-NEXT: store i32 [[ADD5]], i32* [[J]], align 4
+// CHECK15-NEXT: store i32 10, i32* [[K]], align 4
+// CHECK15-NEXT: [[TMP11:%.*]] = load i32, i32* [[I]], align 4
+// CHECK15-NEXT: [[TMP12:%.*]] = load i32, i32* [[J]], align 4
+// CHECK15-NEXT: [[TMP13:%.*]] = load i32, i32* [[F_ADDR]], align 4
+// CHECK15-NEXT: [[MUL6:%.*]] = mul nsw i32 [[TMP12]], [[TMP13]]
+// CHECK15-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP11]], [[MUL6]]
+// CHECK15-NEXT: [[TMP14:%.*]] = load i32, i32* [[K]], align 4
+// CHECK15-NEXT: [[ADD8:%.*]] = add nsw i32 [[ADD7]], [[TMP14]]
+// CHECK15-NEXT: [[TMP15:%.*]] = load i32, i32* [[I]], align 4
+// CHECK15-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* [[TMP0]], i32 0, i32 [[TMP15]]
+// CHECK15-NEXT: [[TMP16:%.*]] = load i32, i32* [[J]], align 4
+// CHECK15-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[ARRAYIDX]], i32 0, i32 [[TMP16]]
+// CHECK15-NEXT: store i32 [[ADD8]], i32* [[ARRAYIDX9]], align 4
+// CHECK15-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK15: omp.body.continue:
+// CHECK15-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK15: omp.inner.for.inc:
+// CHECK15-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK15-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK15-NEXT: [[ADD10:%.*]] = add nsw i32 [[TMP17]], [[TMP18]]
+// CHECK15-NEXT: store i32 [[ADD10]], i32* [[DOTOMP_IV]], align 4
+// CHECK15-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK15: omp.inner.for.end:
+// CHECK15-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK15: omp.loop.exit:
+// CHECK15-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP4]])
+// CHECK15-NEXT: ret void
+//
+//
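(Last kernel of this batch, l67: both loop bounds are runtime values, so the collapsed trip count n*n is computed in 64 bits; that is why the checks below switch to i64 comb bounds and @__kmpc_for_static_init_8 while everything else follows the same distribute-plus-parallel pattern. A consistent source shape, again an assumption:)

    // Hypothetical reconstruction of the l67 region:
    #pragma omp target teams distribute parallel for collapse(2)
    for (int i = 0; i < n; ++i)
      for (int j = 0; j < n; ++j)
        c[i][j] = i + j; // [[ADD36]] in __omp_outlined__9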
+// CHECK15-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l67
+// CHECK15-SAME: (i32 [[N:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]]) #[[ATTR0]] {
+// CHECK15-NEXT: entry:
+// CHECK15-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4
+// CHECK15-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK15-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4
+// CHECK15-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4
+// CHECK15-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4
+// CHECK15-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK15-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0)
+// CHECK15-NEXT: br label [[DOTEXECUTE:%.*]]
+// CHECK15: .execute:
+// CHECK15-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]])
+// CHECK15-NEXT: [[TMP2:%.*]] = load i32, i32* [[N_ADDR]], align 4
+// CHECK15-NEXT: store i32 [[TMP2]], i32* [[N_CASTED]], align 4
+// CHECK15-NEXT: [[TMP3:%.*]] = load i32, i32* [[N_CASTED]], align 4
+// CHECK15-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK15-NEXT: call void @__omp_outlined__8(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32 [[TMP3]], [10 x [10 x i32]]* [[TMP0]]) #[[ATTR2]]
+// CHECK15-NEXT: br label [[DOTOMP_DEINIT:%.*]]
+// CHECK15: .omp.deinit:
+// CHECK15-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0)
+// CHECK15-NEXT: br label [[DOTEXIT:%.*]]
+// CHECK15: .exit:
+// CHECK15-NEXT: ret void
+//
+//
+// CHECK15-LABEL: define {{[^@]+}}@__omp_outlined__8
+// CHECK15-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[N:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]]) #[[ATTR0]] {
+// CHECK15-NEXT: entry:
+// CHECK15-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK15-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK15-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4
+// CHECK15-NEXT: [[DOTOMP_IV:%.*]] = alloca i64, align 8
+// CHECK15-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[_TMP1:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[DOTCAPTURE_EXPR_3:%.*]] = alloca i64, align 8
+// CHECK15-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[J:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i64, align 8
+// CHECK15-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i64, align 8
+// CHECK15-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i64, align 8
+// CHECK15-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[I9:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[J10:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x i8*], align 4
+// CHECK15-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK15-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK15-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4
+// CHECK15-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4
+// CHECK15-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4
+// CHECK15-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4
+// CHECK15-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK15-NEXT: [[TMP2:%.*]] = load i32, i32* [[N_ADDR]], align 4
+// CHECK15-NEXT: store i32 [[TMP2]], i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK15-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK15-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP3]], 0
+// CHECK15-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK15-NEXT: [[CONV:%.*]] = sext i32 [[DIV]] to i64
+// CHECK15-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK15-NEXT: [[SUB4:%.*]] = sub nsw i32 [[TMP4]], 0
+// CHECK15-NEXT: [[DIV5:%.*]] = sdiv i32 [[SUB4]], 1
+// CHECK15-NEXT: [[CONV6:%.*]] = sext i32 [[DIV5]] to i64
+// CHECK15-NEXT: [[MUL:%.*]] = mul nsw i64 [[CONV]], [[CONV6]]
+// CHECK15-NEXT: [[SUB7:%.*]] = sub nsw i64 [[MUL]], 1
+// CHECK15-NEXT: store i64 [[SUB7]], i64* [[DOTCAPTURE_EXPR_3]], align 8
+// CHECK15-NEXT: store i32 0, i32* [[I]], align 4
+// CHECK15-NEXT: store i32 0, i32* [[J]], align 4
+// CHECK15-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK15-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP5]]
+// CHECK15-NEXT: br i1 [[CMP]], label [[LAND_LHS_TRUE:%.*]], label [[OMP_PRECOND_END:%.*]]
+// CHECK15: land.lhs.true:
+// CHECK15-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK15-NEXT: [[CMP8:%.*]] = icmp slt i32 0, [[TMP6]]
+// CHECK15-NEXT: br i1 [[CMP8]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END]]
+// CHECK15: omp.precond.then:
+// CHECK15-NEXT: store i64 0, i64* [[DOTOMP_COMB_LB]], align 8
+// CHECK15-NEXT: [[TMP7:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8
+// CHECK15-NEXT: store i64 [[TMP7]], i64* [[DOTOMP_COMB_UB]], align 8
+// CHECK15-NEXT: store i64 1, i64* [[DOTOMP_STRIDE]], align 8
+// CHECK15-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK15-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK15-NEXT: [[CONV11:%.*]] = zext i32 [[NVPTX_NUM_THREADS]] to i64
+// CHECK15-NEXT: [[TMP8:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK15-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP8]], align 4
+// CHECK15-NEXT: call void @__kmpc_for_static_init_8(%struct.ident_t* @[[GLOB1]], i32 [[TMP9]], i32 91, i32* [[DOTOMP_IS_LAST]], i64* [[DOTOMP_COMB_LB]], i64* [[DOTOMP_COMB_UB]], i64* [[DOTOMP_STRIDE]], i64 1, i64 [[CONV11]])
+// CHECK15-NEXT: [[TMP10:%.*]] = load i64, i64* [[DOTOMP_COMB_UB]], align 8
+// CHECK15-NEXT: [[TMP11:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8
+// CHECK15-NEXT: [[CMP12:%.*]] = icmp sgt i64 [[TMP10]], [[TMP11]]
+// CHECK15-NEXT: br i1 [[CMP12]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK15: cond.true:
+// CHECK15-NEXT: [[TMP12:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8
+// CHECK15-NEXT: br label [[COND_END:%.*]]
+// CHECK15: cond.false:
+// CHECK15-NEXT: [[TMP13:%.*]] = load i64, i64* [[DOTOMP_COMB_UB]], align 8
+// CHECK15-NEXT: br label [[COND_END]]
+// CHECK15: cond.end:
+// CHECK15-NEXT: [[COND:%.*]] = phi i64 [ [[TMP12]], [[COND_TRUE]] ], [ [[TMP13]], [[COND_FALSE]] ]
+// CHECK15-NEXT: store i64 [[COND]], i64* [[DOTOMP_COMB_UB]], align 8
+// CHECK15-NEXT: [[TMP14:%.*]] = load i64, i64* [[DOTOMP_COMB_LB]], align 8
+// CHECK15-NEXT: store i64 [[TMP14]], i64* [[DOTOMP_IV]], align 8
+// CHECK15-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK15: omp.inner.for.cond:
+// CHECK15-NEXT: [[TMP15:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8
+// CHECK15-NEXT: [[TMP16:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8
+// CHECK15-NEXT: [[ADD:%.*]] = add nsw i64 [[TMP16]], 1
+// CHECK15-NEXT: [[CMP13:%.*]] = icmp slt i64 [[TMP15]], [[ADD]]
+// CHECK15-NEXT: br i1 [[CMP13]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK15: omp.inner.for.body:
+// CHECK15-NEXT: [[TMP17:%.*]] = load i64, i64* [[DOTOMP_COMB_LB]], align 8
+// CHECK15-NEXT: [[TMP18:%.*]] = trunc i64 [[TMP17]] to i32
+// CHECK15-NEXT: [[TMP19:%.*]] = load i64, i64* [[DOTOMP_COMB_UB]], align 8
+// CHECK15-NEXT: [[TMP20:%.*]] = trunc i64 [[TMP19]] to i32
+// CHECK15-NEXT: [[TMP21:%.*]] = load i32, i32* [[N_ADDR]], align 4
+// CHECK15-NEXT: store i32 [[TMP21]], i32* [[N_CASTED]], align 4
+// CHECK15-NEXT: [[TMP22:%.*]] = load i32, i32* [[N_CASTED]], align 4
+// CHECK15-NEXT: [[TMP23:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0
+// CHECK15-NEXT: [[TMP24:%.*]] = inttoptr i32 [[TMP18]] to i8*
+// CHECK15-NEXT: store i8* [[TMP24]], i8** [[TMP23]], align 4
+// CHECK15-NEXT: [[TMP25:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1
+// CHECK15-NEXT: [[TMP26:%.*]] = inttoptr i32 [[TMP20]] to i8*
+// CHECK15-NEXT: store i8* [[TMP26]], i8** [[TMP25]], align 4
+// CHECK15-NEXT: [[TMP27:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2
+// CHECK15-NEXT: [[TMP28:%.*]] = inttoptr i32 [[TMP22]] to i8*
+// CHECK15-NEXT: store i8* [[TMP28]], i8** [[TMP27]], align 4
+// CHECK15-NEXT: [[TMP29:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3
+// CHECK15-NEXT: [[TMP30:%.*]] = bitcast [10 x [10 x i32]]* [[TMP0]] to i8*
+// CHECK15-NEXT: store i8* [[TMP30]], i8** [[TMP29]], align 4
+// CHECK15-NEXT: [[TMP31:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK15-NEXT: [[TMP32:%.*]] = load i32, i32* [[TMP31]], align 4
+// CHECK15-NEXT: [[TMP33:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK15-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP32]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, i32, [10 x [10 x i32]]*)* @__omp_outlined__9 to i8*), i8* null, i8** [[TMP33]], i32 4)
+// CHECK15-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK15: omp.inner.for.inc:
+// CHECK15-NEXT: [[TMP34:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8
+// CHECK15-NEXT: [[TMP35:%.*]] = load i64, i64* [[DOTOMP_STRIDE]], align 8
+// CHECK15-NEXT: [[ADD14:%.*]] = add nsw i64 [[TMP34]], [[TMP35]]
+// CHECK15-NEXT: store i64 [[ADD14]], i64* [[DOTOMP_IV]], align 8
+// CHECK15-NEXT: [[TMP36:%.*]] = load i64, i64* [[DOTOMP_COMB_LB]], align 8
+// CHECK15-NEXT: [[TMP37:%.*]] = load i64, i64* [[DOTOMP_STRIDE]], align 8
+// CHECK15-NEXT: [[ADD15:%.*]] = add nsw i64 [[TMP36]], [[TMP37]]
+// CHECK15-NEXT: store i64 [[ADD15]], i64* [[DOTOMP_COMB_LB]], align 8
+// CHECK15-NEXT: [[TMP38:%.*]] = load i64, i64* [[DOTOMP_COMB_UB]], align 8
+// CHECK15-NEXT: [[TMP39:%.*]] = load i64, i64* [[DOTOMP_STRIDE]], align 8
+// CHECK15-NEXT: [[ADD16:%.*]] = add nsw i64 [[TMP38]], [[TMP39]]
+// CHECK15-NEXT: store i64 [[ADD16]], i64* [[DOTOMP_COMB_UB]], align 8
+// CHECK15-NEXT: [[TMP40:%.*]] = load i64, i64* [[DOTOMP_COMB_UB]], align 8
+// CHECK15-NEXT: [[TMP41:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8
+// CHECK15-NEXT: [[CMP17:%.*]] = icmp sgt i64 [[TMP40]], [[TMP41]]
+// CHECK15-NEXT: br i1 [[CMP17]], label [[COND_TRUE18:%.*]], label [[COND_FALSE19:%.*]]
+// CHECK15: cond.true18:
+// CHECK15-NEXT: [[TMP42:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8
+// CHECK15-NEXT: br label [[COND_END20:%.*]]
+// CHECK15: cond.false19:
+// CHECK15-NEXT: [[TMP43:%.*]] = load i64, i64* [[DOTOMP_COMB_UB]], align 8
+// CHECK15-NEXT: br label [[COND_END20]]
+// CHECK15: cond.end20:
+// CHECK15-NEXT: [[COND21:%.*]] = phi i64 [ [[TMP42]], [[COND_TRUE18]] ], [ [[TMP43]], [[COND_FALSE19]] ]
+// CHECK15-NEXT: store i64 [[COND21]], i64* [[DOTOMP_COMB_UB]], align 8
+// CHECK15-NEXT: [[TMP44:%.*]] = load i64, i64* [[DOTOMP_COMB_LB]], align 8
+// CHECK15-NEXT: store i64 [[TMP44]], i64* [[DOTOMP_IV]], align 8
+// CHECK15-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK15: omp.inner.for.end:
+// CHECK15-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK15: omp.loop.exit:
+// CHECK15-NEXT: [[TMP45:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK15-NEXT: [[TMP46:%.*]] = load i32, i32* [[TMP45]], align 4
+// CHECK15-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP46]])
+// CHECK15-NEXT: br label [[OMP_PRECOND_END]]
+// CHECK15: omp.precond.end:
+// CHECK15-NEXT: ret void
+//
+//
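(The worker below zero-extends the 32-bit distribute bounds into its 64-bit LB/UB, [[CONV9]] and [[CONV10]], then de-linearizes the combined IV back into i and j. The sdiv/mul/sub chains are ordinary division and remainder by the inner trip count; a scalar equivalent, with hypothetical names:)

    // What the [[DIV19]] and [[SUB32]] sequences compute per iteration:
    long long iv = 0;            // stand-in for the current .omp.iv value
    int nj = (n - 0) / 1;        // inner loop trip count
    int i = (int)(iv / nj);                        // [[DIV19]] -> [[CONV21]]
    int j = (int)(iv - (iv / nj) * (long long)nj); // iv % nj, [[SUB32]] -> [[CONV35]]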
[[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4 +// CHECK15-NEXT: [[DOTOMP_IV:%.*]] = alloca i64, align 8 +// CHECK15-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: [[_TMP1:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: [[DOTCAPTURE_EXPR_3:%.*]] = alloca i64, align 8 +// CHECK15-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: [[J:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: [[DOTOMP_LB:%.*]] = alloca i64, align 8 +// CHECK15-NEXT: [[DOTOMP_UB:%.*]] = alloca i64, align 8 +// CHECK15-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i64, align 8 +// CHECK15-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: [[I11:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: [[J12:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK15-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK15-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK15-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK15-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK15-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4 +// CHECK15-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4 +// CHECK15-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK15-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK15-NEXT: [[TMP2:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK15-NEXT: store i32 [[TMP2]], i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK15-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK15-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP3]], 0 +// CHECK15-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK15-NEXT: [[CONV:%.*]] = sext i32 [[DIV]] to i64 +// CHECK15-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK15-NEXT: [[SUB4:%.*]] = sub nsw i32 [[TMP4]], 0 +// CHECK15-NEXT: [[DIV5:%.*]] = sdiv i32 [[SUB4]], 1 +// CHECK15-NEXT: [[CONV6:%.*]] = sext i32 [[DIV5]] to i64 +// CHECK15-NEXT: [[MUL:%.*]] = mul nsw i64 [[CONV]], [[CONV6]] +// CHECK15-NEXT: [[SUB7:%.*]] = sub nsw i64 [[MUL]], 1 +// CHECK15-NEXT: store i64 [[SUB7]], i64* [[DOTCAPTURE_EXPR_3]], align 8 +// CHECK15-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK15-NEXT: store i32 0, i32* [[J]], align 4 +// CHECK15-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK15-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP5]] +// CHECK15-NEXT: br i1 [[CMP]], label [[LAND_LHS_TRUE:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK15: land.lhs.true: +// CHECK15-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK15-NEXT: [[CMP8:%.*]] = icmp slt i32 0, [[TMP6]] +// CHECK15-NEXT: br i1 [[CMP8]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END]] +// CHECK15: omp.precond.then: +// CHECK15-NEXT: store i64 0, i64* [[DOTOMP_LB]], align 8 +// CHECK15-NEXT: [[TMP7:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8 +// CHECK15-NEXT: store i64 [[TMP7]], i64* [[DOTOMP_UB]], align 8 +// CHECK15-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK15-NEXT: [[CONV9:%.*]] = zext i32 [[TMP8]] to i64 +// CHECK15-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK15-NEXT: [[CONV10:%.*]] = zext i32 [[TMP9]] to i64 +// CHECK15-NEXT: store 
i64 [[CONV9]], i64* [[DOTOMP_LB]], align 8 +// CHECK15-NEXT: store i64 [[CONV10]], i64* [[DOTOMP_UB]], align 8 +// CHECK15-NEXT: store i64 1, i64* [[DOTOMP_STRIDE]], align 8 +// CHECK15-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK15-NEXT: [[TMP10:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK15-NEXT: [[TMP11:%.*]] = load i32, i32* [[TMP10]], align 4 +// CHECK15-NEXT: call void @__kmpc_for_static_init_8(%struct.ident_t* @[[GLOB2]], i32 [[TMP11]], i32 33, i32* [[DOTOMP_IS_LAST]], i64* [[DOTOMP_LB]], i64* [[DOTOMP_UB]], i64* [[DOTOMP_STRIDE]], i64 1, i64 1) +// CHECK15-NEXT: [[TMP12:%.*]] = load i64, i64* [[DOTOMP_LB]], align 8 +// CHECK15-NEXT: store i64 [[TMP12]], i64* [[DOTOMP_IV]], align 8 +// CHECK15-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK15: omp.inner.for.cond: +// CHECK15-NEXT: [[TMP13:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8 +// CHECK15-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK15-NEXT: [[CONV13:%.*]] = zext i32 [[TMP14]] to i64 +// CHECK15-NEXT: [[CMP14:%.*]] = icmp sle i64 [[TMP13]], [[CONV13]] +// CHECK15-NEXT: br i1 [[CMP14]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK15: omp.inner.for.body: +// CHECK15-NEXT: [[TMP15:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8 +// CHECK15-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK15-NEXT: [[SUB15:%.*]] = sub nsw i32 [[TMP16]], 0 +// CHECK15-NEXT: [[DIV16:%.*]] = sdiv i32 [[SUB15]], 1 +// CHECK15-NEXT: [[MUL17:%.*]] = mul nsw i32 1, [[DIV16]] +// CHECK15-NEXT: [[CONV18:%.*]] = sext i32 [[MUL17]] to i64 +// CHECK15-NEXT: [[DIV19:%.*]] = sdiv i64 [[TMP15]], [[CONV18]] +// CHECK15-NEXT: [[MUL20:%.*]] = mul nsw i64 [[DIV19]], 1 +// CHECK15-NEXT: [[ADD:%.*]] = add nsw i64 0, [[MUL20]] +// CHECK15-NEXT: [[CONV21:%.*]] = trunc i64 [[ADD]] to i32 +// CHECK15-NEXT: store i32 [[CONV21]], i32* [[I11]], align 4 +// CHECK15-NEXT: [[TMP17:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8 +// CHECK15-NEXT: [[TMP18:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8 +// CHECK15-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK15-NEXT: [[SUB22:%.*]] = sub nsw i32 [[TMP19]], 0 +// CHECK15-NEXT: [[DIV23:%.*]] = sdiv i32 [[SUB22]], 1 +// CHECK15-NEXT: [[MUL24:%.*]] = mul nsw i32 1, [[DIV23]] +// CHECK15-NEXT: [[CONV25:%.*]] = sext i32 [[MUL24]] to i64 +// CHECK15-NEXT: [[DIV26:%.*]] = sdiv i64 [[TMP18]], [[CONV25]] +// CHECK15-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK15-NEXT: [[SUB27:%.*]] = sub nsw i32 [[TMP20]], 0 +// CHECK15-NEXT: [[DIV28:%.*]] = sdiv i32 [[SUB27]], 1 +// CHECK15-NEXT: [[MUL29:%.*]] = mul nsw i32 1, [[DIV28]] +// CHECK15-NEXT: [[CONV30:%.*]] = sext i32 [[MUL29]] to i64 +// CHECK15-NEXT: [[MUL31:%.*]] = mul nsw i64 [[DIV26]], [[CONV30]] +// CHECK15-NEXT: [[SUB32:%.*]] = sub nsw i64 [[TMP17]], [[MUL31]] +// CHECK15-NEXT: [[MUL33:%.*]] = mul nsw i64 [[SUB32]], 1 +// CHECK15-NEXT: [[ADD34:%.*]] = add nsw i64 0, [[MUL33]] +// CHECK15-NEXT: [[CONV35:%.*]] = trunc i64 [[ADD34]] to i32 +// CHECK15-NEXT: store i32 [[CONV35]], i32* [[J12]], align 4 +// CHECK15-NEXT: [[TMP21:%.*]] = load i32, i32* [[I11]], align 4 +// CHECK15-NEXT: [[TMP22:%.*]] = load i32, i32* [[J12]], align 4 +// CHECK15-NEXT: [[ADD36:%.*]] = add nsw i32 [[TMP21]], [[TMP22]] +// CHECK15-NEXT: [[TMP23:%.*]] = load i32, i32* [[I11]], align 4 +// CHECK15-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* [[TMP0]], i32 0, i32 [[TMP23]] +// 
CHECK15-NEXT: [[TMP24:%.*]] = load i32, i32* [[J12]], align 4 +// CHECK15-NEXT: [[ARRAYIDX37:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[ARRAYIDX]], i32 0, i32 [[TMP24]] +// CHECK15-NEXT: store i32 [[ADD36]], i32* [[ARRAYIDX37]], align 4 +// CHECK15-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK15: omp.body.continue: +// CHECK15-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK15: omp.inner.for.inc: +// CHECK15-NEXT: [[TMP25:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8 +// CHECK15-NEXT: [[TMP26:%.*]] = load i64, i64* [[DOTOMP_STRIDE]], align 8 +// CHECK15-NEXT: [[ADD38:%.*]] = add nsw i64 [[TMP25]], [[TMP26]] +// CHECK15-NEXT: store i64 [[ADD38]], i64* [[DOTOMP_IV]], align 8 +// CHECK15-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK15: omp.inner.for.end: +// CHECK15-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK15: omp.loop.exit: +// CHECK15-NEXT: [[TMP27:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK15-NEXT: [[TMP28:%.*]] = load i32, i32* [[TMP27]], align 4 +// CHECK15-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP28]]) +// CHECK15-NEXT: br label [[OMP_PRECOND_END]] +// CHECK15: omp.precond.end: +// CHECK15-NEXT: ret void +// +// +// CHECK15-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l74 +// CHECK15-SAME: (i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32* [[V:%.*]]) #[[ATTR0]] { +// CHECK15-NEXT: entry: +// CHECK15-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4 +// CHECK15-NEXT: [[V_ADDR:%.*]] = alloca i32*, align 4 +// CHECK15-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK15-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK15-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4 +// CHECK15-NEXT: store i32* [[V]], i32** [[V_ADDR]], align 4 +// CHECK15-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4 +// CHECK15-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK15-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) +// CHECK15-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK15: .execute: +// CHECK15-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) +// CHECK15-NEXT: [[TMP2:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK15-NEXT: store i32 [[TMP2]], i32* [[N_CASTED]], align 4 +// CHECK15-NEXT: [[TMP3:%.*]] = load i32, i32* [[N_CASTED]], align 4 +// CHECK15-NEXT: [[TMP4:%.*]] = load i32*, i32** [[V_ADDR]], align 4 +// CHECK15-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK15-NEXT: call void @__omp_outlined__10(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32 [[TMP3]], [1000 x i32]* [[TMP0]], i32* [[TMP4]]) #[[ATTR2]] +// CHECK15-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK15: .omp.deinit: +// CHECK15-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) +// CHECK15-NEXT: br label [[DOTEXIT:%.*]] +// CHECK15: .exit: +// CHECK15-NEXT: ret void +// +// +// CHECK15-LABEL: define {{[^@]+}}@__omp_outlined__10 +// CHECK15-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32* [[V:%.*]]) #[[ATTR0]] { +// CHECK15-NEXT: entry: 
+// CHECK15-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK15-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK15-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4 +// CHECK15-NEXT: [[V_ADDR:%.*]] = alloca i32*, align 4 +// CHECK15-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: [[I3:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [5 x i8*], align 4 +// CHECK15-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK15-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK15-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK15-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4 +// CHECK15-NEXT: store i32* [[V]], i32** [[V_ADDR]], align 4 +// CHECK15-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4 +// CHECK15-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK15-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK15-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK15-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 +// CHECK15-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK15-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK15-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK15-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK15-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK15-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] +// CHECK15-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK15: omp.precond.then: +// CHECK15-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK15-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK15-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK15-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK15-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK15-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK15-NEXT: [[TMP5:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK15-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4 +// CHECK15-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP6]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) +// CHECK15-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK15-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK15-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP7]], [[TMP8]] +// CHECK15-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK15: cond.true: +// CHECK15-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK15-NEXT: br label [[COND_END:%.*]] +// CHECK15: 
cond.false: +// CHECK15-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK15-NEXT: br label [[COND_END]] +// CHECK15: cond.end: +// CHECK15-NEXT: [[COND:%.*]] = phi i32 [ [[TMP9]], [[COND_TRUE]] ], [ [[TMP10]], [[COND_FALSE]] ] +// CHECK15-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK15-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK15-NEXT: store i32 [[TMP11]], i32* [[DOTOMP_IV]], align 4 +// CHECK15-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK15: omp.inner.for.cond: +// CHECK15-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK15-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK15-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP13]], 1 +// CHECK15-NEXT: [[CMP5:%.*]] = icmp slt i32 [[TMP12]], [[ADD]] +// CHECK15-NEXT: br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK15: omp.inner.for.body: +// CHECK15-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK15-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK15-NEXT: [[TMP16:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK15-NEXT: store i32 [[TMP16]], i32* [[N_CASTED]], align 4 +// CHECK15-NEXT: [[TMP17:%.*]] = load i32, i32* [[N_CASTED]], align 4 +// CHECK15-NEXT: [[TMP18:%.*]] = load i32*, i32** [[V_ADDR]], align 4 +// CHECK15-NEXT: [[TMP19:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 +// CHECK15-NEXT: [[TMP20:%.*]] = inttoptr i32 [[TMP14]] to i8* +// CHECK15-NEXT: store i8* [[TMP20]], i8** [[TMP19]], align 4 +// CHECK15-NEXT: [[TMP21:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 +// CHECK15-NEXT: [[TMP22:%.*]] = inttoptr i32 [[TMP15]] to i8* +// CHECK15-NEXT: store i8* [[TMP22]], i8** [[TMP21]], align 4 +// CHECK15-NEXT: [[TMP23:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 +// CHECK15-NEXT: [[TMP24:%.*]] = inttoptr i32 [[TMP17]] to i8* +// CHECK15-NEXT: store i8* [[TMP24]], i8** [[TMP23]], align 4 +// CHECK15-NEXT: [[TMP25:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3 +// CHECK15-NEXT: [[TMP26:%.*]] = bitcast [1000 x i32]* [[TMP0]] to i8* +// CHECK15-NEXT: store i8* [[TMP26]], i8** [[TMP25]], align 4 +// CHECK15-NEXT: [[TMP27:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 4 +// CHECK15-NEXT: [[TMP28:%.*]] = bitcast i32* [[TMP18]] to i8* +// CHECK15-NEXT: store i8* [[TMP28]], i8** [[TMP27]], align 4 +// CHECK15-NEXT: [[TMP29:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK15-NEXT: [[TMP30:%.*]] = load i32, i32* [[TMP29]], align 4 +// CHECK15-NEXT: [[TMP31:%.*]] = bitcast [5 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK15-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP30]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, i32, [1000 x i32]*, i32*)* @__omp_outlined__11 to i8*), i8* null, i8** [[TMP31]], i32 5) +// CHECK15-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK15: omp.inner.for.inc: +// CHECK15-NEXT: [[TMP32:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK15-NEXT: [[TMP33:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK15-NEXT: [[ADD6:%.*]] = add nsw i32 [[TMP32]], [[TMP33]] +// CHECK15-NEXT: store i32 [[ADD6]], i32* [[DOTOMP_IV]], align 4 +// CHECK15-NEXT: [[TMP34:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// 
CHECK15-NEXT: [[TMP35:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK15-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP34]], [[TMP35]] +// CHECK15-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK15-NEXT: [[TMP36:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK15-NEXT: [[TMP37:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK15-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP36]], [[TMP37]] +// CHECK15-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK15-NEXT: [[TMP38:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK15-NEXT: [[TMP39:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK15-NEXT: [[CMP9:%.*]] = icmp sgt i32 [[TMP38]], [[TMP39]] +// CHECK15-NEXT: br i1 [[CMP9]], label [[COND_TRUE10:%.*]], label [[COND_FALSE11:%.*]] +// CHECK15: cond.true10: +// CHECK15-NEXT: [[TMP40:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK15-NEXT: br label [[COND_END12:%.*]] +// CHECK15: cond.false11: +// CHECK15-NEXT: [[TMP41:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK15-NEXT: br label [[COND_END12]] +// CHECK15: cond.end12: +// CHECK15-NEXT: [[COND13:%.*]] = phi i32 [ [[TMP40]], [[COND_TRUE10]] ], [ [[TMP41]], [[COND_FALSE11]] ] +// CHECK15-NEXT: store i32 [[COND13]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK15-NEXT: [[TMP42:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK15-NEXT: store i32 [[TMP42]], i32* [[DOTOMP_IV]], align 4 +// CHECK15-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK15: omp.inner.for.end: +// CHECK15-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK15: omp.loop.exit: +// CHECK15-NEXT: [[TMP43:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK15-NEXT: [[TMP44:%.*]] = load i32, i32* [[TMP43]], align 4 +// CHECK15-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP44]]) +// CHECK15-NEXT: br label [[OMP_PRECOND_END]] +// CHECK15: omp.precond.end: +// CHECK15-NEXT: ret void +// +// +// CHECK15-LABEL: define {{[^@]+}}@__omp_outlined__11 +// CHECK15-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32* [[V:%.*]]) #[[ATTR0]] { +// CHECK15-NEXT: entry: +// CHECK15-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK15-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK15-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4 +// CHECK15-NEXT: [[V_ADDR:%.*]] = alloca i32*, align 4 +// CHECK15-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: [[I3:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK15-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK15-NEXT: store i32 
[[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK15-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK15-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK15-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4 +// CHECK15-NEXT: store i32* [[V]], i32** [[V_ADDR]], align 4 +// CHECK15-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4 +// CHECK15-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK15-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK15-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK15-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 +// CHECK15-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK15-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK15-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK15-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK15-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK15-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] +// CHECK15-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK15: omp.precond.then: +// CHECK15-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 +// CHECK15-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK15-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_UB]], align 4 +// CHECK15-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK15-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK15-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_LB]], align 4 +// CHECK15-NEXT: store i32 [[TMP6]], i32* [[DOTOMP_UB]], align 4 +// CHECK15-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK15-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK15-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK15-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4 +// CHECK15-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP8]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) +// CHECK15-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK15-NEXT: store i32 [[TMP9]], i32* [[DOTOMP_IV]], align 4 +// CHECK15-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK15: omp.inner.for.cond: +// CHECK15-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK15-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK15-NEXT: [[CMP4:%.*]] = icmp ule i32 [[TMP10]], [[TMP11]] +// CHECK15-NEXT: br i1 [[CMP4]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK15: omp.inner.for.body: +// CHECK15-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK15-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP12]], 1 +// CHECK15-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK15-NEXT: store i32 [[ADD]], i32* [[I3]], align 4 +// CHECK15-NEXT: [[TMP13:%.*]] = load i32*, i32** [[V_ADDR]], align 4 +// CHECK15-NEXT: [[TMP14:%.*]] = load i32, i32* [[I3]], align 4 +// CHECK15-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TMP13]], i32 [[TMP14]] +// CHECK15-NEXT: [[TMP15:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +// CHECK15-NEXT: [[TMP16:%.*]] = load i32, i32* [[I3]], align 4 +// CHECK15-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds [1000 x i32], [1000 x i32]* [[TMP0]], i32 0, i32 [[TMP16]] +// CHECK15-NEXT: store i32 [[TMP15]], i32* 
[[ARRAYIDX5]], align 4 +// CHECK15-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK15: omp.body.continue: +// CHECK15-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK15: omp.inner.for.inc: +// CHECK15-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK15-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK15-NEXT: [[ADD6:%.*]] = add nsw i32 [[TMP17]], [[TMP18]] +// CHECK15-NEXT: store i32 [[ADD6]], i32* [[DOTOMP_IV]], align 4 +// CHECK15-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK15: omp.inner.for.end: +// CHECK15-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK15: omp.loop.exit: +// CHECK15-NEXT: [[TMP19:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK15-NEXT: [[TMP20:%.*]] = load i32, i32* [[TMP19]], align 4 +// CHECK15-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP20]]) +// CHECK15-NEXT: br label [[OMP_PRECOND_END]] +// CHECK15: omp.precond.end: +// CHECK15-NEXT: ret void +// diff --git a/clang/test/OpenMP/nvptx_target_teams_distribute_parallel_for_generic_mode_codegen.cpp b/clang/test/OpenMP/nvptx_target_teams_distribute_parallel_for_generic_mode_codegen.cpp --- a/clang/test/OpenMP/nvptx_target_teams_distribute_parallel_for_generic_mode_codegen.cpp +++ b/clang/test/OpenMP/nvptx_target_teams_distribute_parallel_for_generic_mode_codegen.cpp @@ -1,15 +1,16 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature --include-generated-funcs --replace-function-regex "__omp_offloading_[0-9a-z]+_[0-9a-z]+" // Test target codegen - host bc file has to be created first. // RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc -// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64 +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix CHECK1 // RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple i386-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm-bc %s -o %t-x86-host.bc -// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 -// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK2 +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | 
FileCheck %s --check-prefix CHECK3 // RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc -// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64 +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix CHECK4 // RUN: %clang_cc1 -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm-bc %s -o %t-x86-host.bc -// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 -// RUN: %clang_cc1 -verify -fopenmp -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK5 +// RUN: %clang_cc1 -verify -fopenmp -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK6 // expected-no-diagnostics #ifndef HEADER @@ -26,48 +27,1814 @@ return 0; } -// CHECK: @__omp_offloading_{{.*}}_main_[[LINE:l.+]]_exec_mode = weak constant i8 0 - -// CHECK: define weak void @__omp_offloading_{{.*}}_main_[[LINE]](i{{64|32}} %{{[^,].*}}, i32* nonnull align {{[0-9]+}} dereferenceable{{[^,]*}}, i{{64|32}} %{{[^,)]*}}) -// CHECK: call void @__kmpc_spmd_kernel_init( -// CHECK: [[TID:%.+]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @ -// CHECK: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) - -// CHECK: call void @__kmpc_for_static_init_4( - -// CHECK: call void [[PARALLEL:@.+]](i32* %{{.*}}, i32* %{{.+}}, i{{64|32}} %{{.+}}, i{{64|32}} %{{.*}}, i{{64|32}} %{{.*}}, i32* %{{.*}}) -// CHECK: br label % - - -// CHECK: call void @__kmpc_for_static_fini(%struct.ident_t* @ - -// CHECK: define internal void [[PARALLEL]](i32* noalias %{{.+}}, i32* noalias %{{.+}}, i{{64|32}} %{{.+}}, i{{64|32}} %{{.+}}, i{{64|32}} [[ARGC:%.+]], i32* nonnull align {{[0-9]+}} dereferenceable{{.*}}) -// CHECK-NOT: call i8* @__kmpc_data_sharing_push_stack( -// CHECK: alloca i{{[0-9]+}}, -// CHECK: alloca i{{[0-9]+}}, -// CHECK: [[ARGC_ADDR:%.+]] = alloca i{{32|64}}, -// CHECK: alloca i{{[0-9]+}}, -// CHECK: alloca i{{[0-9]+}}, -// CHECK: alloca i{{[0-9]+}}, -// CHECK: alloca i{{[0-9]+}}, -// CHECK: alloca i{{[0-9]+}}, -// CHECK: alloca i{{[0-9]+}}, -// CHECK: alloca i{{[0-9]+}}, -// CHECK: alloca i{{[0-9]+}}, -// CHECK: alloca i{{[0-9]+}}, -// CHECK: alloca i{{[0-9]+}}, -// CHECK: [[I:%.+]] = alloca i32, -// CHECK-32: store i32 [[ARGC]], i32* [[ARGC_ADDR]], -// CHECK-64: store i{{64|32}} [[ARGC]], i{{64|32}}* [[ARGC_ADDR]], -// CHECK-64: [[ARGC:%.+]] = bitcast i64* [[ARGC_ADDR]] to i32* - -// CHECK: call void 
@__kmpc_for_static_init_4( -// CHECK: call i32 [[FOO:@.+foo.+]](i32* [[I]]) -// CHECK: call i32 [[FOO]](i32* %{{.+}}) -// CHECK-32: call i32 [[FOO]](i32* [[ARGC_ADDR]]) -// CHECK-64: call i32 [[FOO]](i32* [[ARGC]]) -// CHECK: call void @__kmpc_for_static_fini( - -// CHECK-NOT: call void @__kmpc_data_sharing_pop_stack( #endif +// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l24 +// CHECK1-SAME: (i64 [[ARGC:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i64 [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[ARGC_ADDR:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 8 +// CHECK1-NEXT: [[DOTCAPTURE_EXPR__ADDR:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[ARGC_CASTED:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[DOTCAPTURE_EXPR__CASTED:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK1-NEXT: store i64 [[ARGC]], i64* [[ARGC_ADDR]], align 8 +// CHECK1-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 8 +// CHECK1-NEXT: store i64 [[DOTCAPTURE_EXPR_]], i64* [[DOTCAPTURE_EXPR__ADDR]], align 8 +// CHECK1-NEXT: [[CONV:%.*]] = bitcast i64* [[ARGC_ADDR]] to i32* +// CHECK1-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 8 +// CHECK1-NEXT: [[CONV1:%.*]] = bitcast i64* [[DOTCAPTURE_EXPR__ADDR]] to i32* +// CHECK1-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK1-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) +// CHECK1-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK1: .execute: +// CHECK1-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3:[0-9]+]]) +// CHECK1-NEXT: [[TMP2:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK1-NEXT: [[CONV2:%.*]] = bitcast i64* [[ARGC_CASTED]] to i32* +// CHECK1-NEXT: store i32 [[TMP2]], i32* [[CONV2]], align 4 +// CHECK1-NEXT: [[TMP3:%.*]] = load i64, i64* [[ARGC_CASTED]], align 8 +// CHECK1-NEXT: [[TMP4:%.*]] = load i32, i32* [[CONV1]], align 8 +// CHECK1-NEXT: [[CONV3:%.*]] = bitcast i64* [[DOTCAPTURE_EXPR__CASTED]] to i32* +// CHECK1-NEXT: store i32 [[TMP4]], i32* [[CONV3]], align 4 +// CHECK1-NEXT: [[TMP5:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR__CASTED]], align 8 +// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK1-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i64 [[TMP3]], i32* [[TMP0]], i64 [[TMP5]]) #[[ATTR3:[0-9]+]] +// CHECK1-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK1: .omp.deinit: +// CHECK1-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) +// CHECK1-NEXT: br label [[DOTEXIT:%.*]] +// CHECK1: .exit: +// CHECK1-NEXT: ret void +// +// +// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__ +// CHECK1-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[ARGC:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i64 [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR0]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK1-NEXT: [[ARGC_ADDR:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 8 +// CHECK1-NEXT: [[DOTCAPTURE_EXPR__ADDR:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[TMP:%.*]] = 
alloca i32, align 4 +// CHECK1-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTCAPTURE_EXPR_3:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[I5:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[ARGC_CASTED:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[DOTCAPTURE_EXPR__CASTED:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [5 x i8*], align 8 +// CHECK1-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK1-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +// CHECK1-NEXT: store i64 [[ARGC]], i64* [[ARGC_ADDR]], align 8 +// CHECK1-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 8 +// CHECK1-NEXT: store i64 [[DOTCAPTURE_EXPR_]], i64* [[DOTCAPTURE_EXPR__ADDR]], align 8 +// CHECK1-NEXT: [[CONV:%.*]] = bitcast i64* [[ARGC_ADDR]] to i32* +// CHECK1-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 8 +// CHECK1-NEXT: [[CONV1:%.*]] = bitcast i64* [[DOTCAPTURE_EXPR__ADDR]] to i32* +// CHECK1-NEXT: [[TMP1:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK1-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK1-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 +// CHECK1-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK1-NEXT: [[SUB4:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK1-NEXT: store i32 [[SUB4]], i32* [[DOTCAPTURE_EXPR_3]], align 4 +// CHECK1-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK1-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK1-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] +// CHECK1-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK1: omp.precond.then: +// CHECK1-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK1-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4 +// CHECK1-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK1-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK1-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK1-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK1-NEXT: [[TMP5:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK1-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4 +// CHECK1-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1:[0-9]+]], i32 [[TMP6]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) +// CHECK1-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK1-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4 +// CHECK1-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[TMP7]], [[TMP8]] +// CHECK1-NEXT: br i1 [[CMP6]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK1: cond.true: +// CHECK1-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4 +// CHECK1-NEXT: br label [[COND_END:%.*]] +// CHECK1: cond.false: +// CHECK1-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK1-NEXT: br label [[COND_END]] +// CHECK1: cond.end: +// CHECK1-NEXT: [[COND:%.*]] = phi i32 [ [[TMP9]], [[COND_TRUE]] ], [ 
[[TMP10]], [[COND_FALSE]] ] +// CHECK1-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK1-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK1-NEXT: store i32 [[TMP11]], i32* [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK1: omp.inner.for.cond: +// CHECK1-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4 +// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP13]], 1 +// CHECK1-NEXT: [[CMP7:%.*]] = icmp slt i32 [[TMP12]], [[ADD]] +// CHECK1-NEXT: br i1 [[CMP7]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK1: omp.inner.for.body: +// CHECK1-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK1-NEXT: [[TMP15:%.*]] = zext i32 [[TMP14]] to i64 +// CHECK1-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK1-NEXT: [[TMP17:%.*]] = zext i32 [[TMP16]] to i64 +// CHECK1-NEXT: [[TMP18:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK1-NEXT: [[CONV8:%.*]] = bitcast i64* [[ARGC_CASTED]] to i32* +// CHECK1-NEXT: store i32 [[TMP18]], i32* [[CONV8]], align 4 +// CHECK1-NEXT: [[TMP19:%.*]] = load i64, i64* [[ARGC_CASTED]], align 8 +// CHECK1-NEXT: [[TMP20:%.*]] = load i32, i32* [[CONV1]], align 8 +// CHECK1-NEXT: [[CONV9:%.*]] = bitcast i64* [[DOTCAPTURE_EXPR__CASTED]] to i32* +// CHECK1-NEXT: store i32 [[TMP20]], i32* [[CONV9]], align 4 +// CHECK1-NEXT: [[TMP21:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR__CASTED]], align 8 +// CHECK1-NEXT: [[TMP22:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0 +// CHECK1-NEXT: [[TMP23:%.*]] = inttoptr i64 [[TMP15]] to i8* +// CHECK1-NEXT: store i8* [[TMP23]], i8** [[TMP22]], align 8 +// CHECK1-NEXT: [[TMP24:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1 +// CHECK1-NEXT: [[TMP25:%.*]] = inttoptr i64 [[TMP17]] to i8* +// CHECK1-NEXT: store i8* [[TMP25]], i8** [[TMP24]], align 8 +// CHECK1-NEXT: [[TMP26:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 2 +// CHECK1-NEXT: [[TMP27:%.*]] = inttoptr i64 [[TMP19]] to i8* +// CHECK1-NEXT: store i8* [[TMP27]], i8** [[TMP26]], align 8 +// CHECK1-NEXT: [[TMP28:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 3 +// CHECK1-NEXT: [[TMP29:%.*]] = bitcast i32* [[TMP0]] to i8* +// CHECK1-NEXT: store i8* [[TMP29]], i8** [[TMP28]], align 8 +// CHECK1-NEXT: [[TMP30:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 4 +// CHECK1-NEXT: [[TMP31:%.*]] = inttoptr i64 [[TMP21]] to i8* +// CHECK1-NEXT: store i8* [[TMP31]], i8** [[TMP30]], align 8 +// CHECK1-NEXT: [[TMP32:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK1-NEXT: [[TMP33:%.*]] = load i32, i32* [[TMP32]], align 4 +// CHECK1-NEXT: [[TMP34:%.*]] = bitcast [5 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK1-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP33]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i64, i64, i64, i32*, i64)* @__omp_outlined__1 to i8*), i8* null, i8** [[TMP34]], i64 5) +// CHECK1-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK1: omp.inner.for.inc: +// CHECK1-NEXT: [[TMP35:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: [[TMP36:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK1-NEXT: [[ADD10:%.*]] = add nsw i32 [[TMP35]], [[TMP36]] +// CHECK1-NEXT: store 
i32 [[ADD10]], i32* [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: [[TMP37:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK1-NEXT: [[TMP38:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK1-NEXT: [[ADD11:%.*]] = add nsw i32 [[TMP37]], [[TMP38]] +// CHECK1-NEXT: store i32 [[ADD11]], i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK1-NEXT: [[TMP39:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK1-NEXT: [[TMP40:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK1-NEXT: [[ADD12:%.*]] = add nsw i32 [[TMP39]], [[TMP40]] +// CHECK1-NEXT: store i32 [[ADD12]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK1-NEXT: [[TMP41:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK1-NEXT: [[TMP42:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4 +// CHECK1-NEXT: [[CMP13:%.*]] = icmp sgt i32 [[TMP41]], [[TMP42]] +// CHECK1-NEXT: br i1 [[CMP13]], label [[COND_TRUE14:%.*]], label [[COND_FALSE15:%.*]] +// CHECK1: cond.true14: +// CHECK1-NEXT: [[TMP43:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4 +// CHECK1-NEXT: br label [[COND_END16:%.*]] +// CHECK1: cond.false15: +// CHECK1-NEXT: [[TMP44:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK1-NEXT: br label [[COND_END16]] +// CHECK1: cond.end16: +// CHECK1-NEXT: [[COND17:%.*]] = phi i32 [ [[TMP43]], [[COND_TRUE14]] ], [ [[TMP44]], [[COND_FALSE15]] ] +// CHECK1-NEXT: store i32 [[COND17]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK1-NEXT: [[TMP45:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK1-NEXT: store i32 [[TMP45]], i32* [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK1: omp.inner.for.end: +// CHECK1-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK1: omp.loop.exit: +// CHECK1-NEXT: [[TMP46:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK1-NEXT: [[TMP47:%.*]] = load i32, i32* [[TMP46]], align 4 +// CHECK1-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP47]]) +// CHECK1-NEXT: br label [[OMP_PRECOND_END]] +// CHECK1: omp.precond.end: +// CHECK1-NEXT: ret void +// +// +// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__1 +// CHECK1-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], i64 [[ARGC:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i64 [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR0]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK1-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[ARGC_ADDR:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 8 +// CHECK1-NEXT: [[DOTCAPTURE_EXPR__ADDR:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTCAPTURE_EXPR_3:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[I7:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK1-NEXT: store i32* 
[[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +// CHECK1-NEXT: store i64 [[DOTPREVIOUS_LB_]], i64* [[DOTPREVIOUS_LB__ADDR]], align 8 +// CHECK1-NEXT: store i64 [[DOTPREVIOUS_UB_]], i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK1-NEXT: store i64 [[ARGC]], i64* [[ARGC_ADDR]], align 8 +// CHECK1-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 8 +// CHECK1-NEXT: store i64 [[DOTCAPTURE_EXPR_]], i64* [[DOTCAPTURE_EXPR__ADDR]], align 8 +// CHECK1-NEXT: [[CONV:%.*]] = bitcast i64* [[ARGC_ADDR]] to i32* +// CHECK1-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 8 +// CHECK1-NEXT: [[CONV1:%.*]] = bitcast i64* [[DOTCAPTURE_EXPR__ADDR]] to i32* +// CHECK1-NEXT: [[TMP1:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK1-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK1-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 +// CHECK1-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK1-NEXT: [[SUB4:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK1-NEXT: store i32 [[SUB4]], i32* [[DOTCAPTURE_EXPR_3]], align 4 +// CHECK1-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK1-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK1-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] +// CHECK1-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK1: omp.precond.then: +// CHECK1-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 +// CHECK1-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4 +// CHECK1-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_UB]], align 4 +// CHECK1-NEXT: [[TMP5:%.*]] = load i64, i64* [[DOTPREVIOUS_LB__ADDR]], align 8 +// CHECK1-NEXT: [[CONV5:%.*]] = trunc i64 [[TMP5]] to i32 +// CHECK1-NEXT: [[TMP6:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK1-NEXT: [[CONV6:%.*]] = trunc i64 [[TMP6]] to i32 +// CHECK1-NEXT: store i32 [[CONV5]], i32* [[DOTOMP_LB]], align 4 +// CHECK1-NEXT: store i32 [[CONV6]], i32* [[DOTOMP_UB]], align 4 +// CHECK1-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK1-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK1-NEXT: [[TMP7:%.*]] = load i32, i32* [[CONV1]], align 8 +// CHECK1-NEXT: [[TMP8:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK1-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP8]], align 4 +// CHECK1-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP9]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[TMP7]]) +// CHECK1-NEXT: br label [[OMP_DISPATCH_COND:%.*]] +// CHECK1: omp.dispatch.cond: +// CHECK1-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK1-NEXT: [[CONV8:%.*]] = sext i32 [[TMP10]] to i64 +// CHECK1-NEXT: [[TMP11:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK1-NEXT: [[CMP9:%.*]] = icmp ugt i64 [[CONV8]], [[TMP11]] +// CHECK1-NEXT: br i1 [[CMP9]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK1: cond.true: +// CHECK1-NEXT: [[TMP12:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK1-NEXT: br label [[COND_END:%.*]] +// CHECK1: cond.false: +// CHECK1-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK1-NEXT: [[CONV10:%.*]] = sext i32 [[TMP13]] to i64 +// CHECK1-NEXT: br label [[COND_END]] +// CHECK1: cond.end: +// CHECK1-NEXT: [[COND:%.*]] = phi i64 [ [[TMP12]], [[COND_TRUE]] ], [ [[CONV10]], [[COND_FALSE]] ] +// CHECK1-NEXT: [[CONV11:%.*]] = trunc i64 [[COND]] 
to i32 +// CHECK1-NEXT: store i32 [[CONV11]], i32* [[DOTOMP_UB]], align 4 +// CHECK1-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK1-NEXT: store i32 [[TMP14]], i32* [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK1-NEXT: [[CMP12:%.*]] = icmp sle i32 [[TMP15]], [[TMP16]] +// CHECK1-NEXT: br i1 [[CMP12]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]] +// CHECK1: omp.dispatch.body: +// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK1: omp.inner.for.cond: +// CHECK1-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK1-NEXT: [[CMP13:%.*]] = icmp sle i32 [[TMP17]], [[TMP18]] +// CHECK1-NEXT: br i1 [[CMP13]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK1: omp.inner.for.body: +// CHECK1-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP19]], 1 +// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK1-NEXT: store i32 [[ADD]], i32* [[I7]], align 4 +// CHECK1-NEXT: [[CALL:%.*]] = call i32 @_Z3fooPi(i32* [[I7]]) #[[ATTR4:[0-9]+]] +// CHECK1-NEXT: [[CALL14:%.*]] = call i32 @_Z3fooPi(i32* [[TMP0]]) #[[ATTR4]] +// CHECK1-NEXT: [[ADD15:%.*]] = add nsw i32 [[CALL]], [[CALL14]] +// CHECK1-NEXT: [[CALL16:%.*]] = call i32 @_Z3fooPi(i32* [[CONV]]) #[[ATTR4]] +// CHECK1-NEXT: [[ADD17:%.*]] = add nsw i32 [[ADD15]], [[CALL16]] +// CHECK1-NEXT: store i32 [[ADD17]], i32* [[TMP0]], align 4 +// CHECK1-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK1: omp.body.continue: +// CHECK1-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK1: omp.inner.for.inc: +// CHECK1-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: [[ADD18:%.*]] = add nsw i32 [[TMP20]], 1 +// CHECK1-NEXT: store i32 [[ADD18]], i32* [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK1: omp.inner.for.end: +// CHECK1-NEXT: br label [[OMP_DISPATCH_INC:%.*]] +// CHECK1: omp.dispatch.inc: +// CHECK1-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK1-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK1-NEXT: [[ADD19:%.*]] = add nsw i32 [[TMP21]], [[TMP22]] +// CHECK1-NEXT: store i32 [[ADD19]], i32* [[DOTOMP_LB]], align 4 +// CHECK1-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK1-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK1-NEXT: [[ADD20:%.*]] = add nsw i32 [[TMP23]], [[TMP24]] +// CHECK1-NEXT: store i32 [[ADD20]], i32* [[DOTOMP_UB]], align 4 +// CHECK1-NEXT: br label [[OMP_DISPATCH_COND]] +// CHECK1: omp.dispatch.end: +// CHECK1-NEXT: [[TMP25:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK1-NEXT: [[TMP26:%.*]] = load i32, i32* [[TMP25]], align 4 +// CHECK1-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP26]]) +// CHECK1-NEXT: br label [[OMP_PRECOND_END]] +// CHECK1: omp.precond.end: +// CHECK1-NEXT: ret void +// +// +// CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l24 +// CHECK2-SAME: (i32 [[ARGC:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i32 [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[ARGC_ADDR:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 4 +// CHECK2-NEXT: 
[[DOTCAPTURE_EXPR__ADDR:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[ARGC_CASTED:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTCAPTURE_EXPR__CASTED:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK2-NEXT: store i32 [[ARGC]], i32* [[ARGC_ADDR]], align 4
+// CHECK2-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 4
+// CHECK2-NEXT: store i32 [[DOTCAPTURE_EXPR_]], i32* [[DOTCAPTURE_EXPR__ADDR]], align 4
+// CHECK2-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 4
+// CHECK2-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK2-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0)
+// CHECK2-NEXT: br label [[DOTEXECUTE:%.*]]
+// CHECK2: .execute:
+// CHECK2-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3:[0-9]+]])
+// CHECK2-NEXT: [[TMP2:%.*]] = load i32, i32* [[ARGC_ADDR]], align 4
+// CHECK2-NEXT: store i32 [[TMP2]], i32* [[ARGC_CASTED]], align 4
+// CHECK2-NEXT: [[TMP3:%.*]] = load i32, i32* [[ARGC_CASTED]], align 4
+// CHECK2-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR__ADDR]], align 4
+// CHECK2-NEXT: store i32 [[TMP4]], i32* [[DOTCAPTURE_EXPR__CASTED]], align 4
+// CHECK2-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR__CASTED]], align 4
+// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK2-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32 [[TMP3]], i32* [[TMP0]], i32 [[TMP5]]) #[[ATTR3:[0-9]+]]
+// CHECK2-NEXT: br label [[DOTOMP_DEINIT:%.*]]
+// CHECK2: .omp.deinit:
+// CHECK2-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0)
+// CHECK2-NEXT: br label [[DOTEXIT:%.*]]
+// CHECK2: .exit:
+// CHECK2-NEXT: ret void
+//
+//
+// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__
+// CHECK2-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[ARGC:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i32 [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR0]] {
+// CHECK2-NEXT: entry:
+// CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK2-NEXT: [[ARGC_ADDR:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 4
+// CHECK2-NEXT: [[DOTCAPTURE_EXPR__ADDR:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[I4:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[ARGC_CASTED:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTCAPTURE_EXPR__CASTED:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [5 x i8*], align 4
+// CHECK2-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK2-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK2-NEXT: store i32 [[ARGC]], i32* [[ARGC_ADDR]], align 4
+// CHECK2-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 4
+// CHECK2-NEXT: store i32 [[DOTCAPTURE_EXPR_]], i32* [[DOTCAPTURE_EXPR__ADDR]], align 4
+// CHECK2-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 4
+// CHECK2-NEXT: [[TMP1:%.*]] = load i32, i32* [[ARGC_ADDR]], align 4
+// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK2-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK2-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0
+// CHECK2-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK2-NEXT: [[SUB3:%.*]] = sub nsw i32 [[DIV]], 1
+// CHECK2-NEXT: store i32 [[SUB3]], i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK2-NEXT: store i32 0, i32* [[I]], align 4
+// CHECK2-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK2-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]]
+// CHECK2-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]]
+// CHECK2: omp.precond.then:
+// CHECK2-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK2-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK2-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK2-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK2-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK2-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK2-NEXT: [[TMP5:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK2-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4
+// CHECK2-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1:[0-9]+]], i32 [[TMP6]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]])
+// CHECK2-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK2-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK2-NEXT: [[CMP5:%.*]] = icmp sgt i32 [[TMP7]], [[TMP8]]
+// CHECK2-NEXT: br i1 [[CMP5]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK2: cond.true:
+// CHECK2-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK2-NEXT: br label [[COND_END:%.*]]
+// CHECK2: cond.false:
+// CHECK2-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK2-NEXT: br label [[COND_END]]
+// CHECK2: cond.end:
+// CHECK2-NEXT: [[COND:%.*]] = phi i32 [ [[TMP9]], [[COND_TRUE]] ], [ [[TMP10]], [[COND_FALSE]] ]
+// CHECK2-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK2-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK2-NEXT: store i32 [[TMP11]], i32* [[DOTOMP_IV]], align 4
+// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK2: omp.inner.for.cond:
+// CHECK2-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK2-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK2-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP13]], 1
+// CHECK2-NEXT: [[CMP6:%.*]] = icmp slt i32 [[TMP12]], [[ADD]]
+// CHECK2-NEXT: br i1 [[CMP6]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK2: omp.inner.for.body:
+// CHECK2-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK2-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK2-NEXT: [[TMP16:%.*]] = load i32, i32* [[ARGC_ADDR]], align 4
+// CHECK2-NEXT: store i32 [[TMP16]], i32* [[ARGC_CASTED]], align 4
+// CHECK2-NEXT: [[TMP17:%.*]] = load i32, i32* [[ARGC_CASTED]], align 4
+// CHECK2-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR__ADDR]], align 4
+// CHECK2-NEXT: store i32 [[TMP18]], i32* [[DOTCAPTURE_EXPR__CASTED]], align 4
+// CHECK2-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR__CASTED]], align 4
+// CHECK2-NEXT: [[TMP20:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0
+// CHECK2-NEXT: [[TMP21:%.*]] = inttoptr i32 [[TMP14]] to i8*
+// CHECK2-NEXT: store i8* [[TMP21]], i8** [[TMP20]], align 4
+// CHECK2-NEXT: [[TMP22:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1
+// CHECK2-NEXT: [[TMP23:%.*]] = inttoptr i32 [[TMP15]] to i8*
+// CHECK2-NEXT: store i8* [[TMP23]], i8** [[TMP22]], align 4
+// CHECK2-NEXT: [[TMP24:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2
+// CHECK2-NEXT: [[TMP25:%.*]] = inttoptr i32 [[TMP17]] to i8*
+// CHECK2-NEXT: store i8* [[TMP25]], i8** [[TMP24]], align 4
+// CHECK2-NEXT: [[TMP26:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3
+// CHECK2-NEXT: [[TMP27:%.*]] = bitcast i32* [[TMP0]] to i8*
+// CHECK2-NEXT: store i8* [[TMP27]], i8** [[TMP26]], align 4
+// CHECK2-NEXT: [[TMP28:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 4
+// CHECK2-NEXT: [[TMP29:%.*]] = inttoptr i32 [[TMP19]] to i8*
+// CHECK2-NEXT: store i8* [[TMP29]], i8** [[TMP28]], align 4
+// CHECK2-NEXT: [[TMP30:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK2-NEXT: [[TMP31:%.*]] = load i32, i32* [[TMP30]], align 4
+// CHECK2-NEXT: [[TMP32:%.*]] = bitcast [5 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK2-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP31]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, i32, i32*, i32)* @__omp_outlined__1 to i8*), i8* null, i8** [[TMP32]], i32 5)
+// CHECK2-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK2: omp.inner.for.inc:
+// CHECK2-NEXT: [[TMP33:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK2-NEXT: [[TMP34:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK2-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP33]], [[TMP34]]
+// CHECK2-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_IV]], align 4
+// CHECK2-NEXT: [[TMP35:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK2-NEXT: [[TMP36:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK2-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP35]], [[TMP36]]
+// CHECK2-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK2-NEXT: [[TMP37:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK2-NEXT: [[TMP38:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK2-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP37]], [[TMP38]]
+// CHECK2-NEXT: store i32 [[ADD9]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK2-NEXT: [[TMP39:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK2-NEXT: [[TMP40:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK2-NEXT: [[CMP10:%.*]] = icmp sgt i32 [[TMP39]], [[TMP40]]
+// CHECK2-NEXT: br i1 [[CMP10]], label [[COND_TRUE11:%.*]], label [[COND_FALSE12:%.*]]
+// CHECK2: cond.true11:
+// CHECK2-NEXT: [[TMP41:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK2-NEXT: br label [[COND_END13:%.*]]
+// CHECK2: cond.false12:
+// CHECK2-NEXT: [[TMP42:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK2-NEXT: br label [[COND_END13]]
+// CHECK2: cond.end13:
+// CHECK2-NEXT: [[COND14:%.*]] = phi i32 [ [[TMP41]], [[COND_TRUE11]] ], [ [[TMP42]], [[COND_FALSE12]] ]
+// CHECK2-NEXT: store i32 [[COND14]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK2-NEXT: [[TMP43:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK2-NEXT: store i32 [[TMP43]], i32* [[DOTOMP_IV]], align 4
+// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK2: omp.inner.for.end:
+// CHECK2-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK2: omp.loop.exit:
+// CHECK2-NEXT: [[TMP44:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK2-NEXT: [[TMP45:%.*]] = load i32, i32* [[TMP44]], align 4
+// CHECK2-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP45]])
+// CHECK2-NEXT: br label [[OMP_PRECOND_END]]
+// CHECK2: omp.precond.end:
+// CHECK2-NEXT: ret void
+//
+//
+// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__1
+// CHECK2-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32 [[ARGC:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i32 [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR0]] {
+// CHECK2-NEXT: entry:
+// CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK2-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[ARGC_ADDR:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 4
+// CHECK2-NEXT: [[DOTCAPTURE_EXPR__ADDR:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[I4:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK2-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK2-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK2-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK2-NEXT: store i32 [[ARGC]], i32* [[ARGC_ADDR]], align 4
+// CHECK2-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 4
+// CHECK2-NEXT: store i32 [[DOTCAPTURE_EXPR_]], i32* [[DOTCAPTURE_EXPR__ADDR]], align 4
+// CHECK2-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 4
+// CHECK2-NEXT: [[TMP1:%.*]] = load i32, i32* [[ARGC_ADDR]], align 4
+// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK2-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK2-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0
+// CHECK2-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK2-NEXT: [[SUB3:%.*]] = sub nsw i32 [[DIV]], 1
+// CHECK2-NEXT: store i32 [[SUB3]], i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK2-NEXT: store i32 0, i32* [[I]], align 4
+// CHECK2-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK2-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]]
+// CHECK2-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]]
+// CHECK2: omp.precond.then:
+// CHECK2-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK2-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK2-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_UB]], align 4
+// CHECK2-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK2-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK2-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_LB]], align 4
+// CHECK2-NEXT: store i32 [[TMP6]], i32* [[DOTOMP_UB]], align 4
+// CHECK2-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK2-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK2-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR__ADDR]], align 4
+// CHECK2-NEXT: [[TMP8:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK2-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP8]], align 4
+// CHECK2-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP9]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[TMP7]])
+// CHECK2-NEXT: br label [[OMP_DISPATCH_COND:%.*]]
+// CHECK2: omp.dispatch.cond:
+// CHECK2-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK2-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK2-NEXT: [[CMP5:%.*]] = icmp ugt i32 [[TMP10]], [[TMP11]]
+// CHECK2-NEXT: br i1 [[CMP5]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK2: cond.true:
+// CHECK2-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK2-NEXT: br label [[COND_END:%.*]]
+// CHECK2: cond.false:
+// CHECK2-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK2-NEXT: br label [[COND_END]]
+// CHECK2: cond.end:
+// CHECK2-NEXT: [[COND:%.*]] = phi i32 [ [[TMP12]], [[COND_TRUE]] ], [ [[TMP13]], [[COND_FALSE]] ]
+// CHECK2-NEXT: store i32 [[COND]], i32* [[DOTOMP_UB]], align 4
+// CHECK2-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK2-NEXT: store i32 [[TMP14]], i32* [[DOTOMP_IV]], align 4
+// CHECK2-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK2-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK2-NEXT: [[CMP6:%.*]] = icmp sle i32 [[TMP15]], [[TMP16]]
+// CHECK2-NEXT: br i1 [[CMP6]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]]
+// CHECK2: omp.dispatch.body:
+// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK2: omp.inner.for.cond:
+// CHECK2-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK2-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK2-NEXT: [[CMP7:%.*]] = icmp sle i32 [[TMP17]], [[TMP18]]
+// CHECK2-NEXT: br i1 [[CMP7]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK2: omp.inner.for.body:
+// CHECK2-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK2-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP19]], 1
+// CHECK2-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK2-NEXT: store i32 [[ADD]], i32* [[I4]], align 4
+// CHECK2-NEXT: [[CALL:%.*]] = call i32 @_Z3fooPi(i32* [[I4]]) #[[ATTR4:[0-9]+]]
+// CHECK2-NEXT: [[CALL8:%.*]] = call i32 @_Z3fooPi(i32* [[TMP0]]) #[[ATTR4]]
+// CHECK2-NEXT: [[ADD9:%.*]] = add nsw i32 [[CALL]], [[CALL8]]
+// CHECK2-NEXT: [[CALL10:%.*]] = call i32 @_Z3fooPi(i32* [[ARGC_ADDR]]) #[[ATTR4]]
+// CHECK2-NEXT: [[ADD11:%.*]] = add nsw i32 [[ADD9]], [[CALL10]]
+// CHECK2-NEXT: store i32 [[ADD11]], i32* [[TMP0]], align 4
+// CHECK2-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK2: omp.body.continue:
+// CHECK2-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK2: omp.inner.for.inc:
+// CHECK2-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK2-NEXT: [[ADD12:%.*]] = add nsw i32 [[TMP20]], 1
+// CHECK2-NEXT: store i32 [[ADD12]], i32* [[DOTOMP_IV]], align 4
+// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK2: omp.inner.for.end:
+// CHECK2-NEXT: br label [[OMP_DISPATCH_INC:%.*]]
+// CHECK2: omp.dispatch.inc:
+// CHECK2-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK2-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK2-NEXT: [[ADD13:%.*]] = add nsw i32 [[TMP21]], [[TMP22]]
+// CHECK2-NEXT: store i32 [[ADD13]], i32* [[DOTOMP_LB]], align 4
+// CHECK2-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK2-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK2-NEXT: [[ADD14:%.*]] = add nsw i32 [[TMP23]], [[TMP24]]
+// CHECK2-NEXT: store i32 [[ADD14]], i32* [[DOTOMP_UB]], align 4
+// CHECK2-NEXT: br label [[OMP_DISPATCH_COND]]
+// CHECK2: omp.dispatch.end:
+// CHECK2-NEXT: [[TMP25:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK2-NEXT: [[TMP26:%.*]] = load i32, i32* [[TMP25]], align 4
+// CHECK2-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP26]])
+// CHECK2-NEXT: br label [[OMP_PRECOND_END]]
+// CHECK2: omp.precond.end:
+// CHECK2-NEXT: ret void
+//
+//
+// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l24
+// CHECK3-SAME: (i32 [[ARGC:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i32 [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK3-NEXT: entry:
+// CHECK3-NEXT: [[ARGC_ADDR:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 4
+// CHECK3-NEXT: [[DOTCAPTURE_EXPR__ADDR:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[ARGC_CASTED:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTCAPTURE_EXPR__CASTED:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK3-NEXT: store i32 [[ARGC]], i32* [[ARGC_ADDR]], align 4
+// CHECK3-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 4
+// CHECK3-NEXT: store i32 [[DOTCAPTURE_EXPR_]], i32* [[DOTCAPTURE_EXPR__ADDR]], align 4
+// CHECK3-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 4
+// CHECK3-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK3-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0)
+// CHECK3-NEXT: br label [[DOTEXECUTE:%.*]]
+// CHECK3: .execute:
+// CHECK3-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3:[0-9]+]])
+// CHECK3-NEXT: [[TMP2:%.*]] = load i32, i32* [[ARGC_ADDR]], align 4
+// CHECK3-NEXT: store i32 [[TMP2]], i32* [[ARGC_CASTED]], align 4
+// CHECK3-NEXT: [[TMP3:%.*]] = load i32, i32* [[ARGC_CASTED]], align 4
+// CHECK3-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR__ADDR]], align 4
+// CHECK3-NEXT: store i32 [[TMP4]], i32* [[DOTCAPTURE_EXPR__CASTED]], align 4
+// CHECK3-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR__CASTED]], align 4
+// CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK3-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32 [[TMP3]], i32* [[TMP0]], i32 [[TMP5]]) #[[ATTR3:[0-9]+]]
+// CHECK3-NEXT: br label [[DOTOMP_DEINIT:%.*]]
+// CHECK3: .omp.deinit:
+// CHECK3-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0)
+// CHECK3-NEXT: br label [[DOTEXIT:%.*]]
+// CHECK3: .exit:
+// CHECK3-NEXT: ret void
+//
+//
+// CHECK3-LABEL: define {{[^@]+}}@__omp_outlined__
+// CHECK3-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[ARGC:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i32 [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR0]] {
+// CHECK3-NEXT: entry:
+// CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK3-NEXT: [[ARGC_ADDR:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 4
+// CHECK3-NEXT: [[DOTCAPTURE_EXPR__ADDR:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[I4:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[ARGC_CASTED:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTCAPTURE_EXPR__CASTED:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [5 x i8*], align 4
+// CHECK3-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK3-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK3-NEXT: store i32 [[ARGC]], i32* [[ARGC_ADDR]], align 4
+// CHECK3-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 4
+// CHECK3-NEXT: store i32 [[DOTCAPTURE_EXPR_]], i32* [[DOTCAPTURE_EXPR__ADDR]], align 4
+// CHECK3-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 4
+// CHECK3-NEXT: [[TMP1:%.*]] = load i32, i32* [[ARGC_ADDR]], align 4
+// CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK3-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK3-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0
+// CHECK3-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK3-NEXT: [[SUB3:%.*]] = sub nsw i32 [[DIV]], 1
+// CHECK3-NEXT: store i32 [[SUB3]], i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK3-NEXT: store i32 0, i32* [[I]], align 4
+// CHECK3-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK3-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]]
+// CHECK3-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]]
+// CHECK3: omp.precond.then:
+// CHECK3-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK3-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK3-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK3-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK3-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK3-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK3-NEXT: [[TMP5:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK3-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4
+// CHECK3-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1:[0-9]+]], i32 [[TMP6]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]])
+// CHECK3-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK3-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK3-NEXT: [[CMP5:%.*]] = icmp sgt i32 [[TMP7]], [[TMP8]]
+// CHECK3-NEXT: br i1 [[CMP5]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK3: cond.true:
+// CHECK3-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK3-NEXT: br label [[COND_END:%.*]]
+// CHECK3: cond.false:
+// CHECK3-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK3-NEXT: br label [[COND_END]]
+// CHECK3: cond.end:
+// CHECK3-NEXT: [[COND:%.*]] = phi i32 [ [[TMP9]], [[COND_TRUE]] ], [ [[TMP10]], [[COND_FALSE]] ]
+// CHECK3-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK3-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK3-NEXT: store i32 [[TMP11]], i32* [[DOTOMP_IV]], align 4
+// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK3: omp.inner.for.cond:
+// CHECK3-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK3-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK3-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP13]], 1
+// CHECK3-NEXT: [[CMP6:%.*]] = icmp slt i32 [[TMP12]], [[ADD]]
+// CHECK3-NEXT: br i1 [[CMP6]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK3: omp.inner.for.body:
+// CHECK3-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK3-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK3-NEXT: [[TMP16:%.*]] = load i32, i32* [[ARGC_ADDR]], align 4
+// CHECK3-NEXT: store i32 [[TMP16]], i32* [[ARGC_CASTED]], align 4
+// CHECK3-NEXT: [[TMP17:%.*]] = load i32, i32* [[ARGC_CASTED]], align 4
+// CHECK3-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR__ADDR]], align 4
+// CHECK3-NEXT: store i32 [[TMP18]], i32* [[DOTCAPTURE_EXPR__CASTED]], align 4
+// CHECK3-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR__CASTED]], align 4
+// CHECK3-NEXT: [[TMP20:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0
+// CHECK3-NEXT: [[TMP21:%.*]] = inttoptr i32 [[TMP14]] to i8*
+// CHECK3-NEXT: store i8* [[TMP21]], i8** [[TMP20]], align 4
+// CHECK3-NEXT: [[TMP22:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1
+// CHECK3-NEXT: [[TMP23:%.*]] = inttoptr i32 [[TMP15]] to i8*
+// CHECK3-NEXT: store i8* [[TMP23]], i8** [[TMP22]], align 4
+// CHECK3-NEXT: [[TMP24:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2
+// CHECK3-NEXT: [[TMP25:%.*]] = inttoptr i32 [[TMP17]] to i8*
+// CHECK3-NEXT: store i8* [[TMP25]], i8** [[TMP24]], align 4
+// CHECK3-NEXT: [[TMP26:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3
+// CHECK3-NEXT: [[TMP27:%.*]] = bitcast i32* [[TMP0]] to i8*
+// CHECK3-NEXT: store i8* [[TMP27]], i8** [[TMP26]], align 4
+// CHECK3-NEXT: [[TMP28:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 4
+// CHECK3-NEXT: [[TMP29:%.*]] = inttoptr i32 [[TMP19]] to i8*
+// CHECK3-NEXT: store i8* [[TMP29]], i8** [[TMP28]], align 4
+// CHECK3-NEXT: [[TMP30:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK3-NEXT: [[TMP31:%.*]] = load i32, i32* [[TMP30]], align 4
+// CHECK3-NEXT: [[TMP32:%.*]] = bitcast [5 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK3-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP31]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, i32, i32*, i32)* @__omp_outlined__1 to i8*), i8* null, i8** [[TMP32]], i32 5)
+// CHECK3-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK3: omp.inner.for.inc:
+// CHECK3-NEXT: [[TMP33:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK3-NEXT: [[TMP34:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK3-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP33]], [[TMP34]]
+// CHECK3-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_IV]], align 4
+// CHECK3-NEXT: [[TMP35:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK3-NEXT: [[TMP36:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK3-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP35]], [[TMP36]]
+// CHECK3-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK3-NEXT: [[TMP37:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK3-NEXT: [[TMP38:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK3-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP37]], [[TMP38]]
+// CHECK3-NEXT: store i32 [[ADD9]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK3-NEXT: [[TMP39:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK3-NEXT: [[TMP40:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK3-NEXT: [[CMP10:%.*]] = icmp sgt i32 [[TMP39]], [[TMP40]]
+// CHECK3-NEXT: br i1 [[CMP10]], label [[COND_TRUE11:%.*]], label [[COND_FALSE12:%.*]]
+// CHECK3: cond.true11:
+// CHECK3-NEXT: [[TMP41:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK3-NEXT: br label [[COND_END13:%.*]]
+// CHECK3: cond.false12:
+// CHECK3-NEXT: [[TMP42:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK3-NEXT: br label [[COND_END13]]
+// CHECK3: cond.end13:
+// CHECK3-NEXT: [[COND14:%.*]] = phi i32 [ [[TMP41]], [[COND_TRUE11]] ], [ [[TMP42]], [[COND_FALSE12]] ]
+// CHECK3-NEXT: store i32 [[COND14]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK3-NEXT: [[TMP43:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK3-NEXT: store i32 [[TMP43]], i32* [[DOTOMP_IV]], align 4
+// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK3: omp.inner.for.end:
+// CHECK3-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK3: omp.loop.exit:
+// CHECK3-NEXT: [[TMP44:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK3-NEXT: [[TMP45:%.*]] = load i32, i32* [[TMP44]], align 4
+// CHECK3-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP45]])
+// CHECK3-NEXT: br label [[OMP_PRECOND_END]]
+// CHECK3: omp.precond.end:
+// CHECK3-NEXT: ret void
+//
+//
+// CHECK3-LABEL: define {{[^@]+}}@__omp_outlined__1
+// CHECK3-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32 [[ARGC:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i32 [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR0]] {
+// CHECK3-NEXT: entry:
+// CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK3-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[ARGC_ADDR:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 4
+// CHECK3-NEXT: [[DOTCAPTURE_EXPR__ADDR:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[I4:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK3-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK3-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK3-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK3-NEXT: store i32 [[ARGC]], i32* [[ARGC_ADDR]], align 4
+// CHECK3-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 4
+// CHECK3-NEXT: store i32 [[DOTCAPTURE_EXPR_]], i32* [[DOTCAPTURE_EXPR__ADDR]], align 4
+// CHECK3-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 4
+// CHECK3-NEXT: [[TMP1:%.*]] = load i32, i32* [[ARGC_ADDR]], align 4
+// CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK3-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK3-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0
+// CHECK3-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK3-NEXT: [[SUB3:%.*]] = sub nsw i32 [[DIV]], 1
+// CHECK3-NEXT: store i32 [[SUB3]], i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK3-NEXT: store i32 0, i32* [[I]], align 4
+// CHECK3-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK3-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]]
+// CHECK3-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]]
+// CHECK3: omp.precond.then:
+// CHECK3-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK3-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK3-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_UB]], align 4
+// CHECK3-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK3-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK3-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_LB]], align 4
+// CHECK3-NEXT: store i32 [[TMP6]], i32* [[DOTOMP_UB]], align 4
+// CHECK3-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK3-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK3-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR__ADDR]], align 4
+// CHECK3-NEXT: [[TMP8:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK3-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP8]], align 4
+// CHECK3-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP9]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[TMP7]])
+// CHECK3-NEXT: br label [[OMP_DISPATCH_COND:%.*]]
+// CHECK3: omp.dispatch.cond:
+// CHECK3-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK3-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK3-NEXT: [[CMP5:%.*]] = icmp ugt i32 [[TMP10]], [[TMP11]]
+// CHECK3-NEXT: br i1 [[CMP5]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK3: cond.true:
+// CHECK3-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK3-NEXT: br label [[COND_END:%.*]]
+// CHECK3: cond.false:
+// CHECK3-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK3-NEXT: br label [[COND_END]]
+// CHECK3: cond.end:
+// CHECK3-NEXT: [[COND:%.*]] = phi i32 [ [[TMP12]], [[COND_TRUE]] ], [ [[TMP13]], [[COND_FALSE]] ]
+// CHECK3-NEXT: store i32 [[COND]], i32* [[DOTOMP_UB]], align 4
+// CHECK3-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK3-NEXT: store i32 [[TMP14]], i32* [[DOTOMP_IV]], align 4
+// CHECK3-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK3-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK3-NEXT: [[CMP6:%.*]] = icmp sle i32 [[TMP15]], [[TMP16]]
+// CHECK3-NEXT: br i1 [[CMP6]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]]
+// CHECK3: omp.dispatch.body:
+// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK3: omp.inner.for.cond:
+// CHECK3-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK3-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK3-NEXT: [[CMP7:%.*]] = icmp sle i32 [[TMP17]], [[TMP18]]
+// CHECK3-NEXT: br i1 [[CMP7]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK3: omp.inner.for.body:
+// CHECK3-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK3-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP19]], 1
+// CHECK3-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK3-NEXT: store i32 [[ADD]], i32* [[I4]], align 4
+// CHECK3-NEXT: [[CALL:%.*]] = call i32 @_Z3fooPi(i32* [[I4]]) #[[ATTR4:[0-9]+]]
+// CHECK3-NEXT: [[CALL8:%.*]] = call i32 @_Z3fooPi(i32* [[TMP0]]) #[[ATTR4]]
+// CHECK3-NEXT: [[ADD9:%.*]] = add nsw i32 [[CALL]], [[CALL8]]
+// CHECK3-NEXT: [[CALL10:%.*]] = call i32 @_Z3fooPi(i32* [[ARGC_ADDR]]) #[[ATTR4]]
+// CHECK3-NEXT: [[ADD11:%.*]] = add nsw i32 [[ADD9]], [[CALL10]]
+// CHECK3-NEXT: store i32 [[ADD11]], i32* [[TMP0]], align 4
+// CHECK3-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK3: omp.body.continue:
+// CHECK3-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK3: omp.inner.for.inc:
+// CHECK3-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK3-NEXT: [[ADD12:%.*]] = add nsw i32 [[TMP20]], 1
+// CHECK3-NEXT: store i32 [[ADD12]], i32* [[DOTOMP_IV]], align 4
+// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK3: omp.inner.for.end:
+// CHECK3-NEXT: br label [[OMP_DISPATCH_INC:%.*]]
+// CHECK3: omp.dispatch.inc:
+// CHECK3-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK3-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK3-NEXT: [[ADD13:%.*]] = add nsw i32 [[TMP21]], [[TMP22]]
+// CHECK3-NEXT: store i32 [[ADD13]], i32* [[DOTOMP_LB]], align 4
+// CHECK3-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK3-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK3-NEXT: [[ADD14:%.*]] = add nsw i32 [[TMP23]], [[TMP24]]
+// CHECK3-NEXT: store i32 [[ADD14]], i32* [[DOTOMP_UB]], align 4
+// CHECK3-NEXT: br label [[OMP_DISPATCH_COND]]
+// CHECK3: omp.dispatch.end:
+// CHECK3-NEXT: [[TMP25:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK3-NEXT: [[TMP26:%.*]] = load i32, i32* [[TMP25]], align 4
+// CHECK3-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP26]])
+// CHECK3-NEXT: br label [[OMP_PRECOND_END]]
+// CHECK3: omp.precond.end:
+// CHECK3-NEXT: ret void
+//
+//
+// CHECK4-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l24
+// CHECK4-SAME: (i64 [[ARGC:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i64 [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK4-NEXT: entry:
+// CHECK4-NEXT: [[ARGC_ADDR:%.*]] = alloca i64, align 8
+// CHECK4-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 8
+// CHECK4-NEXT: [[DOTCAPTURE_EXPR__ADDR:%.*]] = alloca i64, align 8
+// CHECK4-NEXT: [[ARGC_CASTED:%.*]] = alloca i64, align 8
+// CHECK4-NEXT: [[DOTCAPTURE_EXPR__CASTED:%.*]] = alloca i64, align 8
+// CHECK4-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK4-NEXT: store i64 [[ARGC]], i64* [[ARGC_ADDR]], align 8
+// CHECK4-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 8
+// CHECK4-NEXT: store i64 [[DOTCAPTURE_EXPR_]], i64* [[DOTCAPTURE_EXPR__ADDR]], align 8
+// CHECK4-NEXT: [[CONV:%.*]] = bitcast i64* [[ARGC_ADDR]] to i32*
+// CHECK4-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 8
+// CHECK4-NEXT: [[CONV1:%.*]] = bitcast i64* [[DOTCAPTURE_EXPR__ADDR]] to i32*
+// CHECK4-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK4-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0)
+// CHECK4-NEXT: br label [[DOTEXECUTE:%.*]]
+// CHECK4: .execute:
+// CHECK4-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3:[0-9]+]])
+// CHECK4-NEXT: [[TMP2:%.*]] = load i32, i32* [[CONV]], align 8
+// CHECK4-NEXT: [[CONV2:%.*]] = bitcast i64* [[ARGC_CASTED]] to i32*
+// CHECK4-NEXT: store i32 [[TMP2]], i32* [[CONV2]], align 4
+// CHECK4-NEXT: [[TMP3:%.*]] = load i64, i64* [[ARGC_CASTED]], align 8
+// CHECK4-NEXT: [[TMP4:%.*]] = load i32, i32* [[CONV1]], align 8
+// CHECK4-NEXT: [[CONV3:%.*]] = bitcast i64* [[DOTCAPTURE_EXPR__CASTED]] to i32*
+// CHECK4-NEXT: store i32 [[TMP4]], i32* [[CONV3]], align 4
+// CHECK4-NEXT: [[TMP5:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR__CASTED]], align 8
+// CHECK4-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK4-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i64 [[TMP3]], i32* [[TMP0]], i64 [[TMP5]]) #[[ATTR3:[0-9]+]]
+// CHECK4-NEXT: br label [[DOTOMP_DEINIT:%.*]]
+// CHECK4: .omp.deinit:
+// CHECK4-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0)
+// CHECK4-NEXT: br label [[DOTEXIT:%.*]]
+// CHECK4: .exit:
+// CHECK4-NEXT: ret void
+//
+//
+// CHECK4-LABEL: define {{[^@]+}}@__omp_outlined__
+// CHECK4-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[ARGC:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i64 [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR0]] {
+// CHECK4-NEXT: entry:
+// CHECK4-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK4-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK4-NEXT: [[ARGC_ADDR:%.*]] = alloca i64, align 8
+// CHECK4-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 8
+// CHECK4-NEXT: [[DOTCAPTURE_EXPR__ADDR:%.*]] = alloca i64, align 8
+// CHECK4-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[DOTCAPTURE_EXPR_3:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[I5:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[ARGC_CASTED:%.*]] = alloca i64, align 8
+// CHECK4-NEXT: [[DOTCAPTURE_EXPR__CASTED:%.*]] = alloca i64, align 8
+// CHECK4-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [5 x i8*], align 8
+// CHECK4-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK4-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
+// CHECK4-NEXT: store i64 [[ARGC]], i64* [[ARGC_ADDR]], align 8
+// CHECK4-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 8
+// CHECK4-NEXT: store i64 [[DOTCAPTURE_EXPR_]], i64* [[DOTCAPTURE_EXPR__ADDR]], align 8
+// CHECK4-NEXT: [[CONV:%.*]] = bitcast i64* [[ARGC_ADDR]] to i32*
+// CHECK4-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 8
+// CHECK4-NEXT: [[CONV1:%.*]] = bitcast i64* [[DOTCAPTURE_EXPR__ADDR]] to i32*
+// CHECK4-NEXT: [[TMP1:%.*]] = load i32, i32* [[CONV]], align 8
+// CHECK4-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK4-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK4-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0
+// CHECK4-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK4-NEXT: [[SUB4:%.*]] = sub nsw i32 [[DIV]], 1
+// CHECK4-NEXT: store i32 [[SUB4]], i32* [[DOTCAPTURE_EXPR_3]], align 4
+// CHECK4-NEXT: store i32 0, i32* [[I]], align 4
+// CHECK4-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK4-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]]
+// CHECK4-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]]
+// CHECK4: omp.precond.then:
+// CHECK4-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK4-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4
+// CHECK4-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK4-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK4-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK4-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK4-NEXT: [[TMP5:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK4-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4
+// CHECK4-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1:[0-9]+]], i32 [[TMP6]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]])
+// CHECK4-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK4-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4
+// CHECK4-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[TMP7]], [[TMP8]]
+// CHECK4-NEXT: br i1 [[CMP6]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK4: cond.true:
+// CHECK4-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4
+// CHECK4-NEXT: br label [[COND_END:%.*]]
+// CHECK4: cond.false:
+// CHECK4-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK4-NEXT: br label [[COND_END]]
+// CHECK4: cond.end:
+// CHECK4-NEXT: [[COND:%.*]] = phi i32 [ [[TMP9]], [[COND_TRUE]] ], [ [[TMP10]], [[COND_FALSE]] ]
+// CHECK4-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK4-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK4-NEXT: store i32 [[TMP11]], i32* [[DOTOMP_IV]], align 4
+// CHECK4-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK4: omp.inner.for.cond:
+// CHECK4-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK4-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4
+// CHECK4-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP13]], 1
+// CHECK4-NEXT: [[CMP7:%.*]] = icmp slt i32 [[TMP12]], [[ADD]]
+// CHECK4-NEXT: br i1 [[CMP7]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK4: omp.inner.for.body:
+// CHECK4-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK4-NEXT: [[TMP15:%.*]] = zext i32 [[TMP14]] to i64
+// CHECK4-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK4-NEXT: [[TMP17:%.*]] = zext i32 [[TMP16]] to i64
+// CHECK4-NEXT: [[TMP18:%.*]] = load i32, i32* [[CONV]], align 8
+// CHECK4-NEXT: [[CONV8:%.*]] = bitcast i64* [[ARGC_CASTED]] to i32*
+// CHECK4-NEXT: store i32 [[TMP18]], i32* [[CONV8]], align 4
+// CHECK4-NEXT: [[TMP19:%.*]] = load i64, i64* [[ARGC_CASTED]], align 8
+// CHECK4-NEXT: [[TMP20:%.*]] = load i32, i32* [[CONV1]], align 8
+// CHECK4-NEXT: [[CONV9:%.*]] = bitcast i64* [[DOTCAPTURE_EXPR__CASTED]] to i32*
+// CHECK4-NEXT: store i32 [[TMP20]], i32* [[CONV9]], align 4
+// CHECK4-NEXT: [[TMP21:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR__CASTED]], align 8
+// CHECK4-NEXT: [[TMP22:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0
+// CHECK4-NEXT: [[TMP23:%.*]] = inttoptr i64 [[TMP15]] to i8*
+// CHECK4-NEXT: store i8* [[TMP23]], i8** [[TMP22]], align 8
+// CHECK4-NEXT: [[TMP24:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1
+// CHECK4-NEXT: [[TMP25:%.*]] = inttoptr i64 [[TMP17]] to i8*
+// CHECK4-NEXT: store i8* [[TMP25]], i8** [[TMP24]], align 8
+// CHECK4-NEXT: [[TMP26:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 2
+// CHECK4-NEXT: [[TMP27:%.*]] = inttoptr i64 [[TMP19]] to i8*
+// CHECK4-NEXT: store i8* [[TMP27]], i8** [[TMP26]], align 8
+// CHECK4-NEXT: [[TMP28:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 3
+// CHECK4-NEXT: [[TMP29:%.*]] = bitcast i32* [[TMP0]] to i8*
+// CHECK4-NEXT: store i8* [[TMP29]], i8** [[TMP28]], align 8
+// CHECK4-NEXT: [[TMP30:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 4
+// CHECK4-NEXT: [[TMP31:%.*]] = inttoptr i64 [[TMP21]] to i8*
+// CHECK4-NEXT: store i8* [[TMP31]], i8** [[TMP30]], align 8
+// CHECK4-NEXT: [[TMP32:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK4-NEXT: [[TMP33:%.*]] = load i32, i32* [[TMP32]], align 4
+// CHECK4-NEXT: [[TMP34:%.*]] = bitcast [5 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK4-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP33]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i64, i64, i64, i32*, i64)* @__omp_outlined__1 to i8*), i8* null, i8** [[TMP34]], i64 5)
+// CHECK4-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK4: omp.inner.for.inc:
+// CHECK4-NEXT: [[TMP35:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK4-NEXT: [[TMP36:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK4-NEXT: [[ADD10:%.*]] = add nsw i32 [[TMP35]], [[TMP36]]
+// CHECK4-NEXT: store i32 [[ADD10]], i32* [[DOTOMP_IV]], align 4
+// CHECK4-NEXT: [[TMP37:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK4-NEXT: [[TMP38:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK4-NEXT: [[ADD11:%.*]] = add nsw i32 [[TMP37]], [[TMP38]]
+// CHECK4-NEXT: store i32 [[ADD11]], i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK4-NEXT: [[TMP39:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK4-NEXT: [[TMP40:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK4-NEXT: [[ADD12:%.*]] = add nsw i32 [[TMP39]], [[TMP40]]
+// CHECK4-NEXT: store i32 [[ADD12]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK4-NEXT: [[TMP41:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK4-NEXT: [[TMP42:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4
+// CHECK4-NEXT: [[CMP13:%.*]] = icmp sgt i32 [[TMP41]], [[TMP42]]
+// CHECK4-NEXT: br i1 [[CMP13]], label [[COND_TRUE14:%.*]], label [[COND_FALSE15:%.*]]
+// CHECK4: cond.true14:
+// CHECK4-NEXT: [[TMP43:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4
+// CHECK4-NEXT: br label [[COND_END16:%.*]]
+// CHECK4: cond.false15:
+// CHECK4-NEXT: [[TMP44:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK4-NEXT: br label [[COND_END16]]
+// CHECK4: cond.end16:
+// CHECK4-NEXT: [[COND17:%.*]] = phi i32 [ [[TMP43]], [[COND_TRUE14]] ], [ [[TMP44]], [[COND_FALSE15]] ]
+// CHECK4-NEXT: store i32 [[COND17]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK4-NEXT: [[TMP45:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK4-NEXT: store i32 [[TMP45]], i32* [[DOTOMP_IV]], align 4
+// CHECK4-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK4: omp.inner.for.end:
+// CHECK4-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK4: omp.loop.exit:
+// CHECK4-NEXT: [[TMP46:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK4-NEXT: [[TMP47:%.*]] = load i32, i32* [[TMP46]], align 4
+// CHECK4-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP47]])
+// CHECK4-NEXT: br label [[OMP_PRECOND_END]]
+// CHECK4: omp.precond.end:
+// CHECK4-NEXT: ret void
+//
+//
+// CHECK4-LABEL: define {{[^@]+}}@__omp_outlined__1
+// CHECK4-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], i64 [[ARGC:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i64 [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR0]] {
+// CHECK4-NEXT: entry:
+// CHECK4-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK4-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK4-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8
+// CHECK4-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8
+// CHECK4-NEXT: [[ARGC_ADDR:%.*]] = alloca i64, align 8
+// CHECK4-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 8
+// CHECK4-NEXT: [[DOTCAPTURE_EXPR__ADDR:%.*]] = alloca i64, align 8
+// CHECK4-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[DOTCAPTURE_EXPR_3:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[I7:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK4-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
+// CHECK4-NEXT: store i64 [[DOTPREVIOUS_LB_]], i64* [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK4-NEXT: store i64 [[DOTPREVIOUS_UB_]], i64* [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK4-NEXT: store i64 [[ARGC]], i64* [[ARGC_ADDR]], align 8
+// CHECK4-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 8
+// CHECK4-NEXT: store i64 [[DOTCAPTURE_EXPR_]], i64* [[DOTCAPTURE_EXPR__ADDR]], align 8
+// CHECK4-NEXT: [[CONV:%.*]] = bitcast i64* [[ARGC_ADDR]] to i32*
+// CHECK4-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 8
+// CHECK4-NEXT: [[CONV1:%.*]] = bitcast i64* [[DOTCAPTURE_EXPR__ADDR]] to i32*
+// CHECK4-NEXT: [[TMP1:%.*]] = load i32, i32* [[CONV]], align 8
+// CHECK4-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK4-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK4-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0
+// CHECK4-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK4-NEXT: [[SUB4:%.*]] = sub nsw i32 [[DIV]], 1
+// CHECK4-NEXT: store i32 [[SUB4]], i32* [[DOTCAPTURE_EXPR_3]], align 4
+// CHECK4-NEXT: store i32 0, i32* [[I]], align 4
+// CHECK4-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK4-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]]
+// CHECK4-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]]
+// CHECK4: omp.precond.then:
+// CHECK4-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK4-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4
+// CHECK4-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_UB]], align 4
+// CHECK4-NEXT: [[TMP5:%.*]] = load i64, i64* [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK4-NEXT: [[CONV5:%.*]] = trunc i64 [[TMP5]] to i32
+// CHECK4-NEXT: [[TMP6:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK4-NEXT: [[CONV6:%.*]] = trunc i64 [[TMP6]] to i32
+// CHECK4-NEXT: store i32 [[CONV5]], i32* [[DOTOMP_LB]], align 4
+// CHECK4-NEXT: store i32 [[CONV6]], i32* [[DOTOMP_UB]], align 4
+// CHECK4-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK4-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK4-NEXT: [[TMP7:%.*]] = load i32, i32* [[CONV1]], align 8
+// CHECK4-NEXT: [[TMP8:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK4-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP8]], align 4
+// CHECK4-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP9]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[TMP7]])
+// CHECK4-NEXT: br label [[OMP_DISPATCH_COND:%.*]]
+// CHECK4: omp.dispatch.cond:
+// CHECK4-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK4-NEXT: [[CONV8:%.*]] = sext i32 [[TMP10]] to i64
+// CHECK4-NEXT: [[TMP11:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK4-NEXT: [[CMP9:%.*]] = icmp ugt i64 [[CONV8]], [[TMP11]]
+// CHECK4-NEXT: br i1 [[CMP9]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK4: cond.true:
+// CHECK4-NEXT: [[TMP12:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK4-NEXT: br label [[COND_END:%.*]]
+// CHECK4: cond.false:
+// CHECK4-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK4-NEXT: [[CONV10:%.*]] = sext i32 [[TMP13]] to i64
+// CHECK4-NEXT: br label [[COND_END]]
+// CHECK4: cond.end:
+// CHECK4-NEXT: [[COND:%.*]] = phi i64 [ [[TMP12]], [[COND_TRUE]] ], [ [[CONV10]], [[COND_FALSE]] ]
+// CHECK4-NEXT: [[CONV11:%.*]] = trunc i64 [[COND]] to i32
+// CHECK4-NEXT: store i32 [[CONV11]], i32* [[DOTOMP_UB]], align 4
+// CHECK4-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK4-NEXT: store i32 [[TMP14]], i32* [[DOTOMP_IV]], align 4
+// CHECK4-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK4-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK4-NEXT: [[CMP12:%.*]] = icmp sle i32 [[TMP15]], [[TMP16]]
+// CHECK4-NEXT: br i1 [[CMP12]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]]
+// CHECK4: omp.dispatch.body:
+// CHECK4-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK4: omp.inner.for.cond:
+// CHECK4-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK4-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK4-NEXT: [[CMP13:%.*]] = icmp sle i32 [[TMP17]], [[TMP18]]
+// CHECK4-NEXT: br i1 [[CMP13]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK4: omp.inner.for.body:
+// CHECK4-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK4-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP19]], 1
+// CHECK4-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK4-NEXT: store i32 [[ADD]], i32* [[I7]], align 4
+// CHECK4-NEXT: [[CALL:%.*]] = call i32 @_Z3fooPi(i32* [[I7]]) #[[ATTR4:[0-9]+]]
+// CHECK4-NEXT: [[CALL14:%.*]] = call i32 @_Z3fooPi(i32* [[TMP0]]) #[[ATTR4]]
+// CHECK4-NEXT: [[ADD15:%.*]] = add nsw i32 [[CALL]], [[CALL14]]
+// CHECK4-NEXT: [[CALL16:%.*]] = call i32 @_Z3fooPi(i32* [[CONV]]) #[[ATTR4]]
+// CHECK4-NEXT: [[ADD17:%.*]] = add nsw i32 [[ADD15]], [[CALL16]]
+// CHECK4-NEXT: store i32 [[ADD17]], i32* [[TMP0]], align 4
+// CHECK4-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK4: omp.body.continue:
+// CHECK4-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK4: omp.inner.for.inc:
+// CHECK4-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK4-NEXT: [[ADD18:%.*]] = add nsw i32 [[TMP20]], 1
+// CHECK4-NEXT: store i32 [[ADD18]], i32* [[DOTOMP_IV]], align 4
+// CHECK4-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK4: omp.inner.for.end:
+// CHECK4-NEXT: br label [[OMP_DISPATCH_INC:%.*]]
+// CHECK4: omp.dispatch.inc:
+// CHECK4-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK4-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK4-NEXT: [[ADD19:%.*]] = add nsw i32 [[TMP21]], [[TMP22]]
+// CHECK4-NEXT: store i32 [[ADD19]], i32* [[DOTOMP_LB]], align 4
+// CHECK4-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK4-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK4-NEXT: [[ADD20:%.*]] = add nsw i32 [[TMP23]], [[TMP24]]
+// CHECK4-NEXT: store i32 [[ADD20]], i32* [[DOTOMP_UB]], align 4
+// CHECK4-NEXT: br label [[OMP_DISPATCH_COND]]
+// CHECK4: omp.dispatch.end:
+// CHECK4-NEXT: [[TMP25:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK4-NEXT: [[TMP26:%.*]] = load i32, i32* [[TMP25]], align 4
+// CHECK4-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP26]])
+// CHECK4-NEXT: br label [[OMP_PRECOND_END]]
+// CHECK4: omp.precond.end:
+// CHECK4-NEXT: ret void
+//
+//
+// CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l24
+// CHECK5-SAME: (i32 [[ARGC:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i32 [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK5-NEXT: entry:
+// CHECK5-NEXT: [[ARGC_ADDR:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 4
+// CHECK5-NEXT: [[DOTCAPTURE_EXPR__ADDR:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[ARGC_CASTED:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[DOTCAPTURE_EXPR__CASTED:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK5-NEXT: store i32 [[ARGC]], i32* [[ARGC_ADDR]], align 4
+// CHECK5-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 4
+// CHECK5-NEXT: store i32 [[DOTCAPTURE_EXPR_]], i32* [[DOTCAPTURE_EXPR__ADDR]], align 4
+// CHECK5-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 4
+// CHECK5-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK5-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0)
+// CHECK5-NEXT: br label [[DOTEXECUTE:%.*]]
+// CHECK5: .execute:
+// CHECK5-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3:[0-9]+]])
+// CHECK5-NEXT: [[TMP2:%.*]] = load i32, i32* [[ARGC_ADDR]], align 4
+// CHECK5-NEXT: store i32 [[TMP2]], i32* [[ARGC_CASTED]], align 4
+// CHECK5-NEXT: [[TMP3:%.*]] = load i32, i32* [[ARGC_CASTED]], align 4
+// CHECK5-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR__ADDR]], align 4
+// CHECK5-NEXT: store i32 [[TMP4]], i32* [[DOTCAPTURE_EXPR__CASTED]], align 4
+// CHECK5-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR__CASTED]], align 4
+// CHECK5-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK5-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32 [[TMP3]], i32* [[TMP0]], i32 [[TMP5]]) #[[ATTR3:[0-9]+]]
+// CHECK5-NEXT: br label [[DOTOMP_DEINIT:%.*]]
+// CHECK5: .omp.deinit:
+// CHECK5-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0)
+// CHECK5-NEXT: br label [[DOTEXIT:%.*]]
+// CHECK5: .exit:
+// CHECK5-NEXT: ret void
+//
+//
+// CHECK5-LABEL: define {{[^@]+}}@__omp_outlined__
+// CHECK5-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[ARGC:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i32 [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR0]] {
+// CHECK5-NEXT: entry:
+// CHECK5-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK5-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK5-NEXT: [[ARGC_ADDR:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 4
+// CHECK5-NEXT: [[DOTCAPTURE_EXPR__ADDR:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[I4:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[ARGC_CASTED:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[DOTCAPTURE_EXPR__CASTED:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [5 x i8*], align 4
+// CHECK5-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK5-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK5-NEXT: store i32 [[ARGC]], i32* [[ARGC_ADDR]], align 4
+// CHECK5-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 4
+// CHECK5-NEXT: store i32 [[DOTCAPTURE_EXPR_]], i32* [[DOTCAPTURE_EXPR__ADDR]], align 4
+// CHECK5-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 4
+// CHECK5-NEXT: [[TMP1:%.*]] = load i32, i32* [[ARGC_ADDR]], align 4
+// CHECK5-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK5-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK5-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0
+// CHECK5-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK5-NEXT: [[SUB3:%.*]] = sub nsw i32 [[DIV]], 1
+// CHECK5-NEXT: store i32 [[SUB3]], i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK5-NEXT: store i32 0, i32* [[I]], align 4
+// CHECK5-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK5-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]]
+// CHECK5-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]]
+// CHECK5: omp.precond.then:
+// CHECK5-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK5-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK5-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK5-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK5-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK5-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK5-NEXT: [[TMP5:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK5-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4
+// CHECK5-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1:[0-9]+]], i32 [[TMP6]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]])
+// CHECK5-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK5-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK5-NEXT: [[CMP5:%.*]] = icmp sgt i32 [[TMP7]], [[TMP8]]
+// CHECK5-NEXT: br i1 [[CMP5]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK5: cond.true:
+// CHECK5-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK5-NEXT: br label [[COND_END:%.*]]
+// CHECK5: cond.false:
+// CHECK5-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK5-NEXT: br label [[COND_END]]
+// CHECK5: cond.end:
+// CHECK5-NEXT: [[COND:%.*]] = phi i32 [ [[TMP9]], [[COND_TRUE]] ], [ [[TMP10]], [[COND_FALSE]] ]
+// CHECK5-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK5-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK5-NEXT: store i32 [[TMP11]], i32* [[DOTOMP_IV]], align 4
+// CHECK5-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK5: omp.inner.for.cond:
+// CHECK5-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK5-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK5-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP13]], 1
+// CHECK5-NEXT: [[CMP6:%.*]] = icmp slt i32 [[TMP12]], [[ADD]]
+// CHECK5-NEXT: br i1 [[CMP6]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK5: omp.inner.for.body:
+// CHECK5-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK5-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK5-NEXT: [[TMP16:%.*]] = load i32, i32* [[ARGC_ADDR]], align 4
+// CHECK5-NEXT: store i32 [[TMP16]], i32* [[ARGC_CASTED]], align 4
+// CHECK5-NEXT: [[TMP17:%.*]] = load i32, i32* [[ARGC_CASTED]], align 4
+// CHECK5-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR__ADDR]], align 4
+// CHECK5-NEXT: store i32 [[TMP18]], i32* [[DOTCAPTURE_EXPR__CASTED]], align 4
+// CHECK5-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR__CASTED]], align 4
+// CHECK5-NEXT: [[TMP20:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0
+// CHECK5-NEXT: [[TMP21:%.*]] = inttoptr i32 [[TMP14]] to i8*
+// CHECK5-NEXT: store i8* [[TMP21]], i8** [[TMP20]], align 4
+// CHECK5-NEXT: [[TMP22:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1
+// CHECK5-NEXT: [[TMP23:%.*]] = inttoptr i32 [[TMP15]] to i8*
+// CHECK5-NEXT: store i8* [[TMP23]], i8** [[TMP22]], align 4
+// CHECK5-NEXT: [[TMP24:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2
+// CHECK5-NEXT: [[TMP25:%.*]] = inttoptr i32 [[TMP17]] to i8*
+// CHECK5-NEXT: store i8* [[TMP25]], i8** [[TMP24]], align 4
+// CHECK5-NEXT: [[TMP26:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3
+// CHECK5-NEXT: [[TMP27:%.*]] = bitcast i32* [[TMP0]] to i8*
+// CHECK5-NEXT: store i8* [[TMP27]], i8** [[TMP26]], align 4
+// CHECK5-NEXT: [[TMP28:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 4
+// CHECK5-NEXT: [[TMP29:%.*]] = inttoptr i32 [[TMP19]] to i8*
+// CHECK5-NEXT: store i8* [[TMP29]], i8** [[TMP28]], align 4
+// CHECK5-NEXT: [[TMP30:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK5-NEXT: [[TMP31:%.*]] = load i32, i32* [[TMP30]], align 4
+// CHECK5-NEXT: [[TMP32:%.*]] = bitcast [5 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK5-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP31]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, i32, i32*, i32)* @__omp_outlined__1 to i8*), i8* null, i8** [[TMP32]], i32 5)
+// CHECK5-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK5: omp.inner.for.inc:
+// CHECK5-NEXT: [[TMP33:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK5-NEXT: [[TMP34:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK5-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP33]], [[TMP34]]
+// CHECK5-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_IV]], align 4
+// CHECK5-NEXT: [[TMP35:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK5-NEXT: [[TMP36:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK5-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP35]], [[TMP36]]
+// CHECK5-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK5-NEXT: [[TMP37:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK5-NEXT: [[TMP38:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK5-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP37]], [[TMP38]]
+// CHECK5-NEXT: store i32 [[ADD9]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK5-NEXT: [[TMP39:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK5-NEXT: [[TMP40:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK5-NEXT: [[CMP10:%.*]] = icmp sgt i32 [[TMP39]], [[TMP40]]
+// CHECK5-NEXT: br i1 [[CMP10]], label [[COND_TRUE11:%.*]], label [[COND_FALSE12:%.*]]
+// CHECK5: cond.true11:
+// CHECK5-NEXT: [[TMP41:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK5-NEXT: br label [[COND_END13:%.*]]
+// CHECK5: cond.false12:
+// CHECK5-NEXT: [[TMP42:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK5-NEXT: br label [[COND_END13]]
+// CHECK5: cond.end13:
+// CHECK5-NEXT: [[COND14:%.*]] = phi i32 [ [[TMP41]], [[COND_TRUE11]] ], [ [[TMP42]], [[COND_FALSE12]] ]
+// CHECK5-NEXT: store i32 [[COND14]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK5-NEXT: [[TMP43:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK5-NEXT: store i32 [[TMP43]], i32* [[DOTOMP_IV]], align 4
+// CHECK5-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK5: omp.inner.for.end:
+// CHECK5-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK5: omp.loop.exit:
+// CHECK5-NEXT: [[TMP44:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK5-NEXT: [[TMP45:%.*]] = load i32, i32* [[TMP44]], align 4
+// CHECK5-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP45]])
+// CHECK5-NEXT: br label [[OMP_PRECOND_END]]
+// CHECK5: omp.precond.end:
+// CHECK5-NEXT: ret void
+//
+//
+// CHECK5-LABEL: define {{[^@]+}}@__omp_outlined__1
+// CHECK5-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32 [[ARGC:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i32 [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR0]] {
+// CHECK5-NEXT: entry:
+// CHECK5-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK5-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK5-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[ARGC_ADDR:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 4
+// CHECK5-NEXT: [[DOTCAPTURE_EXPR__ADDR:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[I4:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK5-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK5-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK5-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK5-NEXT: store i32 [[ARGC]], i32* [[ARGC_ADDR]], align 4
+// CHECK5-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 4
+// CHECK5-NEXT: store i32 [[DOTCAPTURE_EXPR_]], i32* [[DOTCAPTURE_EXPR__ADDR]], align 4
+// CHECK5-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 4
+// CHECK5-NEXT: [[TMP1:%.*]] = load i32, i32* [[ARGC_ADDR]], align 4
+// CHECK5-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK5-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK5-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0
+// CHECK5-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK5-NEXT: [[SUB3:%.*]] = sub nsw i32 [[DIV]], 1
+// CHECK5-NEXT: store i32 [[SUB3]], i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK5-NEXT: store i32 0, i32* [[I]], align 4
+// CHECK5-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK5-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]]
+// CHECK5-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]]
+// CHECK5: omp.precond.then:
+// CHECK5-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK5-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK5-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_UB]], align 4
+// CHECK5-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK5-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK5-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_LB]], align 4
+// CHECK5-NEXT: store i32 [[TMP6]], i32* [[DOTOMP_UB]], align 4
+// CHECK5-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK5-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK5-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR__ADDR]], align 4
+// CHECK5-NEXT: [[TMP8:%.*]] = load i32*,
i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK5-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP8]], align 4 +// CHECK5-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP9]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[TMP7]]) +// CHECK5-NEXT: br label [[OMP_DISPATCH_COND:%.*]] +// CHECK5: omp.dispatch.cond: +// CHECK5-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK5-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK5-NEXT: [[CMP5:%.*]] = icmp ugt i32 [[TMP10]], [[TMP11]] +// CHECK5-NEXT: br i1 [[CMP5]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK5: cond.true: +// CHECK5-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK5-NEXT: br label [[COND_END:%.*]] +// CHECK5: cond.false: +// CHECK5-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK5-NEXT: br label [[COND_END]] +// CHECK5: cond.end: +// CHECK5-NEXT: [[COND:%.*]] = phi i32 [ [[TMP12]], [[COND_TRUE]] ], [ [[TMP13]], [[COND_FALSE]] ] +// CHECK5-NEXT: store i32 [[COND]], i32* [[DOTOMP_UB]], align 4 +// CHECK5-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK5-NEXT: store i32 [[TMP14]], i32* [[DOTOMP_IV]], align 4 +// CHECK5-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK5-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK5-NEXT: [[CMP6:%.*]] = icmp sle i32 [[TMP15]], [[TMP16]] +// CHECK5-NEXT: br i1 [[CMP6]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]] +// CHECK5: omp.dispatch.body: +// CHECK5-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK5: omp.inner.for.cond: +// CHECK5-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK5-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK5-NEXT: [[CMP7:%.*]] = icmp sle i32 [[TMP17]], [[TMP18]] +// CHECK5-NEXT: br i1 [[CMP7]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK5: omp.inner.for.body: +// CHECK5-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK5-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP19]], 1 +// CHECK5-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK5-NEXT: store i32 [[ADD]], i32* [[I4]], align 4 +// CHECK5-NEXT: [[CALL:%.*]] = call i32 @_Z3fooPi(i32* [[I4]]) #[[ATTR4:[0-9]+]] +// CHECK5-NEXT: [[CALL8:%.*]] = call i32 @_Z3fooPi(i32* [[TMP0]]) #[[ATTR4]] +// CHECK5-NEXT: [[ADD9:%.*]] = add nsw i32 [[CALL]], [[CALL8]] +// CHECK5-NEXT: [[CALL10:%.*]] = call i32 @_Z3fooPi(i32* [[ARGC_ADDR]]) #[[ATTR4]] +// CHECK5-NEXT: [[ADD11:%.*]] = add nsw i32 [[ADD9]], [[CALL10]] +// CHECK5-NEXT: store i32 [[ADD11]], i32* [[TMP0]], align 4 +// CHECK5-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK5: omp.body.continue: +// CHECK5-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK5: omp.inner.for.inc: +// CHECK5-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK5-NEXT: [[ADD12:%.*]] = add nsw i32 [[TMP20]], 1 +// CHECK5-NEXT: store i32 [[ADD12]], i32* [[DOTOMP_IV]], align 4 +// CHECK5-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK5: omp.inner.for.end: +// CHECK5-NEXT: br label [[OMP_DISPATCH_INC:%.*]] +// CHECK5: omp.dispatch.inc: +// CHECK5-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK5-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK5-NEXT: [[ADD13:%.*]] = add nsw i32 [[TMP21]], [[TMP22]] +// CHECK5-NEXT: store i32 [[ADD13]], i32* [[DOTOMP_LB]], align 4 +// 
CHECK5-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK5-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK5-NEXT: [[ADD14:%.*]] = add nsw i32 [[TMP23]], [[TMP24]] +// CHECK5-NEXT: store i32 [[ADD14]], i32* [[DOTOMP_UB]], align 4 +// CHECK5-NEXT: br label [[OMP_DISPATCH_COND]] +// CHECK5: omp.dispatch.end: +// CHECK5-NEXT: [[TMP25:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK5-NEXT: [[TMP26:%.*]] = load i32, i32* [[TMP25]], align 4 +// CHECK5-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP26]]) +// CHECK5-NEXT: br label [[OMP_PRECOND_END]] +// CHECK5: omp.precond.end: +// CHECK5-NEXT: ret void +// +// +// CHECK6-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l24 +// CHECK6-SAME: (i32 [[ARGC:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i32 [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK6-NEXT: entry: +// CHECK6-NEXT: [[ARGC_ADDR:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 4 +// CHECK6-NEXT: [[DOTCAPTURE_EXPR__ADDR:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[ARGC_CASTED:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTCAPTURE_EXPR__CASTED:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK6-NEXT: store i32 [[ARGC]], i32* [[ARGC_ADDR]], align 4 +// CHECK6-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 4 +// CHECK6-NEXT: store i32 [[DOTCAPTURE_EXPR_]], i32* [[DOTCAPTURE_EXPR__ADDR]], align 4 +// CHECK6-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 4 +// CHECK6-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK6-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) +// CHECK6-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK6: .execute: +// CHECK6-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3:[0-9]+]]) +// CHECK6-NEXT: [[TMP2:%.*]] = load i32, i32* [[ARGC_ADDR]], align 4 +// CHECK6-NEXT: store i32 [[TMP2]], i32* [[ARGC_CASTED]], align 4 +// CHECK6-NEXT: [[TMP3:%.*]] = load i32, i32* [[ARGC_CASTED]], align 4 +// CHECK6-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR__ADDR]], align 4 +// CHECK6-NEXT: store i32 [[TMP4]], i32* [[DOTCAPTURE_EXPR__CASTED]], align 4 +// CHECK6-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR__CASTED]], align 4 +// CHECK6-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK6-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32 [[TMP3]], i32* [[TMP0]], i32 [[TMP5]]) #[[ATTR3:[0-9]+]] +// CHECK6-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK6: .omp.deinit: +// CHECK6-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) +// CHECK6-NEXT: br label [[DOTEXIT:%.*]] +// CHECK6: .exit: +// CHECK6-NEXT: ret void +// +// +// CHECK6-LABEL: define {{[^@]+}}@__omp_outlined__ +// CHECK6-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[ARGC:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i32 [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR0]] { +// CHECK6-NEXT: entry: +// CHECK6-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK6-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK6-NEXT: [[ARGC_ADDR:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 4 +// CHECK6-NEXT: 
[[DOTCAPTURE_EXPR__ADDR:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[I4:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[ARGC_CASTED:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTCAPTURE_EXPR__CASTED:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [5 x i8*], align 4 +// CHECK6-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK6-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK6-NEXT: store i32 [[ARGC]], i32* [[ARGC_ADDR]], align 4 +// CHECK6-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 4 +// CHECK6-NEXT: store i32 [[DOTCAPTURE_EXPR_]], i32* [[DOTCAPTURE_EXPR__ADDR]], align 4 +// CHECK6-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 4 +// CHECK6-NEXT: [[TMP1:%.*]] = load i32, i32* [[ARGC_ADDR]], align 4 +// CHECK6-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK6-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK6-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 +// CHECK6-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK6-NEXT: [[SUB3:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK6-NEXT: store i32 [[SUB3]], i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK6-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK6-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK6-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] +// CHECK6-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK6: omp.precond.then: +// CHECK6-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK6-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK6-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK6-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK6-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK6-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK6-NEXT: [[TMP5:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK6-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4 +// CHECK6-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1:[0-9]+]], i32 [[TMP6]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) +// CHECK6-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK6-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK6-NEXT: [[CMP5:%.*]] = icmp sgt i32 [[TMP7]], [[TMP8]] +// CHECK6-NEXT: br i1 [[CMP5]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK6: cond.true: +// CHECK6-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK6-NEXT: br label [[COND_END:%.*]] +// CHECK6: cond.false: +// CHECK6-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK6-NEXT: br label [[COND_END]] +// CHECK6: cond.end: +// CHECK6-NEXT: [[COND:%.*]] = phi i32 [ [[TMP9]], [[COND_TRUE]] ], [ 
[[TMP10]], [[COND_FALSE]] ] +// CHECK6-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK6-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK6-NEXT: store i32 [[TMP11]], i32* [[DOTOMP_IV]], align 4 +// CHECK6-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK6: omp.inner.for.cond: +// CHECK6-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK6-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK6-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP13]], 1 +// CHECK6-NEXT: [[CMP6:%.*]] = icmp slt i32 [[TMP12]], [[ADD]] +// CHECK6-NEXT: br i1 [[CMP6]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK6: omp.inner.for.body: +// CHECK6-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK6-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK6-NEXT: [[TMP16:%.*]] = load i32, i32* [[ARGC_ADDR]], align 4 +// CHECK6-NEXT: store i32 [[TMP16]], i32* [[ARGC_CASTED]], align 4 +// CHECK6-NEXT: [[TMP17:%.*]] = load i32, i32* [[ARGC_CASTED]], align 4 +// CHECK6-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR__ADDR]], align 4 +// CHECK6-NEXT: store i32 [[TMP18]], i32* [[DOTCAPTURE_EXPR__CASTED]], align 4 +// CHECK6-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR__CASTED]], align 4 +// CHECK6-NEXT: [[TMP20:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 +// CHECK6-NEXT: [[TMP21:%.*]] = inttoptr i32 [[TMP14]] to i8* +// CHECK6-NEXT: store i8* [[TMP21]], i8** [[TMP20]], align 4 +// CHECK6-NEXT: [[TMP22:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 +// CHECK6-NEXT: [[TMP23:%.*]] = inttoptr i32 [[TMP15]] to i8* +// CHECK6-NEXT: store i8* [[TMP23]], i8** [[TMP22]], align 4 +// CHECK6-NEXT: [[TMP24:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 +// CHECK6-NEXT: [[TMP25:%.*]] = inttoptr i32 [[TMP17]] to i8* +// CHECK6-NEXT: store i8* [[TMP25]], i8** [[TMP24]], align 4 +// CHECK6-NEXT: [[TMP26:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3 +// CHECK6-NEXT: [[TMP27:%.*]] = bitcast i32* [[TMP0]] to i8* +// CHECK6-NEXT: store i8* [[TMP27]], i8** [[TMP26]], align 4 +// CHECK6-NEXT: [[TMP28:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 4 +// CHECK6-NEXT: [[TMP29:%.*]] = inttoptr i32 [[TMP19]] to i8* +// CHECK6-NEXT: store i8* [[TMP29]], i8** [[TMP28]], align 4 +// CHECK6-NEXT: [[TMP30:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK6-NEXT: [[TMP31:%.*]] = load i32, i32* [[TMP30]], align 4 +// CHECK6-NEXT: [[TMP32:%.*]] = bitcast [5 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK6-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP31]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, i32, i32*, i32)* @__omp_outlined__1 to i8*), i8* null, i8** [[TMP32]], i32 5) +// CHECK6-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK6: omp.inner.for.inc: +// CHECK6-NEXT: [[TMP33:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK6-NEXT: [[TMP34:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK6-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP33]], [[TMP34]] +// CHECK6-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_IV]], align 4 +// CHECK6-NEXT: [[TMP35:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK6-NEXT: [[TMP36:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK6-NEXT: [[ADD8:%.*]] = add 
nsw i32 [[TMP35]], [[TMP36]] +// CHECK6-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK6-NEXT: [[TMP37:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK6-NEXT: [[TMP38:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK6-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP37]], [[TMP38]] +// CHECK6-NEXT: store i32 [[ADD9]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK6-NEXT: [[TMP39:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK6-NEXT: [[TMP40:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK6-NEXT: [[CMP10:%.*]] = icmp sgt i32 [[TMP39]], [[TMP40]] +// CHECK6-NEXT: br i1 [[CMP10]], label [[COND_TRUE11:%.*]], label [[COND_FALSE12:%.*]] +// CHECK6: cond.true11: +// CHECK6-NEXT: [[TMP41:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK6-NEXT: br label [[COND_END13:%.*]] +// CHECK6: cond.false12: +// CHECK6-NEXT: [[TMP42:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK6-NEXT: br label [[COND_END13]] +// CHECK6: cond.end13: +// CHECK6-NEXT: [[COND14:%.*]] = phi i32 [ [[TMP41]], [[COND_TRUE11]] ], [ [[TMP42]], [[COND_FALSE12]] ] +// CHECK6-NEXT: store i32 [[COND14]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK6-NEXT: [[TMP43:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK6-NEXT: store i32 [[TMP43]], i32* [[DOTOMP_IV]], align 4 +// CHECK6-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK6: omp.inner.for.end: +// CHECK6-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK6: omp.loop.exit: +// CHECK6-NEXT: [[TMP44:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK6-NEXT: [[TMP45:%.*]] = load i32, i32* [[TMP44]], align 4 +// CHECK6-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP45]]) +// CHECK6-NEXT: br label [[OMP_PRECOND_END]] +// CHECK6: omp.precond.end: +// CHECK6-NEXT: ret void +// +// +// CHECK6-LABEL: define {{[^@]+}}@__omp_outlined__1 +// CHECK6-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32 [[ARGC:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i32 [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR0]] { +// CHECK6-NEXT: entry: +// CHECK6-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK6-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK6-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[ARGC_ADDR:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 4 +// CHECK6-NEXT: [[DOTCAPTURE_EXPR__ADDR:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[I4:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK6-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK6-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK6-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// 
CHECK6-NEXT: store i32 [[ARGC]], i32* [[ARGC_ADDR]], align 4 +// CHECK6-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 4 +// CHECK6-NEXT: store i32 [[DOTCAPTURE_EXPR_]], i32* [[DOTCAPTURE_EXPR__ADDR]], align 4 +// CHECK6-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 4 +// CHECK6-NEXT: [[TMP1:%.*]] = load i32, i32* [[ARGC_ADDR]], align 4 +// CHECK6-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK6-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK6-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 +// CHECK6-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK6-NEXT: [[SUB3:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK6-NEXT: store i32 [[SUB3]], i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK6-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK6-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK6-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] +// CHECK6-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK6: omp.precond.then: +// CHECK6-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 +// CHECK6-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK6-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_UB]], align 4 +// CHECK6-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK6-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK6-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_LB]], align 4 +// CHECK6-NEXT: store i32 [[TMP6]], i32* [[DOTOMP_UB]], align 4 +// CHECK6-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK6-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK6-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR__ADDR]], align 4 +// CHECK6-NEXT: [[TMP8:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK6-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP8]], align 4 +// CHECK6-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP9]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[TMP7]]) +// CHECK6-NEXT: br label [[OMP_DISPATCH_COND:%.*]] +// CHECK6: omp.dispatch.cond: +// CHECK6-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK6-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK6-NEXT: [[CMP5:%.*]] = icmp ugt i32 [[TMP10]], [[TMP11]] +// CHECK6-NEXT: br i1 [[CMP5]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK6: cond.true: +// CHECK6-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK6-NEXT: br label [[COND_END:%.*]] +// CHECK6: cond.false: +// CHECK6-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK6-NEXT: br label [[COND_END]] +// CHECK6: cond.end: +// CHECK6-NEXT: [[COND:%.*]] = phi i32 [ [[TMP12]], [[COND_TRUE]] ], [ [[TMP13]], [[COND_FALSE]] ] +// CHECK6-NEXT: store i32 [[COND]], i32* [[DOTOMP_UB]], align 4 +// CHECK6-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK6-NEXT: store i32 [[TMP14]], i32* [[DOTOMP_IV]], align 4 +// CHECK6-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK6-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK6-NEXT: [[CMP6:%.*]] = icmp sle i32 [[TMP15]], [[TMP16]] +// CHECK6-NEXT: br i1 [[CMP6]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]] +// CHECK6: omp.dispatch.body: +// CHECK6-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK6: omp.inner.for.cond: +// CHECK6-NEXT: [[TMP17:%.*]] 
= load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK6-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK6-NEXT: [[CMP7:%.*]] = icmp sle i32 [[TMP17]], [[TMP18]] +// CHECK6-NEXT: br i1 [[CMP7]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK6: omp.inner.for.body: +// CHECK6-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK6-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP19]], 1 +// CHECK6-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK6-NEXT: store i32 [[ADD]], i32* [[I4]], align 4 +// CHECK6-NEXT: [[CALL:%.*]] = call i32 @_Z3fooPi(i32* [[I4]]) #[[ATTR4:[0-9]+]] +// CHECK6-NEXT: [[CALL8:%.*]] = call i32 @_Z3fooPi(i32* [[TMP0]]) #[[ATTR4]] +// CHECK6-NEXT: [[ADD9:%.*]] = add nsw i32 [[CALL]], [[CALL8]] +// CHECK6-NEXT: [[CALL10:%.*]] = call i32 @_Z3fooPi(i32* [[ARGC_ADDR]]) #[[ATTR4]] +// CHECK6-NEXT: [[ADD11:%.*]] = add nsw i32 [[ADD9]], [[CALL10]] +// CHECK6-NEXT: store i32 [[ADD11]], i32* [[TMP0]], align 4 +// CHECK6-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK6: omp.body.continue: +// CHECK6-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK6: omp.inner.for.inc: +// CHECK6-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK6-NEXT: [[ADD12:%.*]] = add nsw i32 [[TMP20]], 1 +// CHECK6-NEXT: store i32 [[ADD12]], i32* [[DOTOMP_IV]], align 4 +// CHECK6-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK6: omp.inner.for.end: +// CHECK6-NEXT: br label [[OMP_DISPATCH_INC:%.*]] +// CHECK6: omp.dispatch.inc: +// CHECK6-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK6-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK6-NEXT: [[ADD13:%.*]] = add nsw i32 [[TMP21]], [[TMP22]] +// CHECK6-NEXT: store i32 [[ADD13]], i32* [[DOTOMP_LB]], align 4 +// CHECK6-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK6-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK6-NEXT: [[ADD14:%.*]] = add nsw i32 [[TMP23]], [[TMP24]] +// CHECK6-NEXT: store i32 [[ADD14]], i32* [[DOTOMP_UB]], align 4 +// CHECK6-NEXT: br label [[OMP_DISPATCH_COND]] +// CHECK6: omp.dispatch.end: +// CHECK6-NEXT: [[TMP25:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK6-NEXT: [[TMP26:%.*]] = load i32, i32* [[TMP25]], align 4 +// CHECK6-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP26]]) +// CHECK6-NEXT: br label [[OMP_PRECOND_END]] +// CHECK6: omp.precond.end: +// CHECK6-NEXT: ret void +// diff --git a/clang/test/OpenMP/nvptx_target_teams_distribute_parallel_for_simd_codegen.cpp b/clang/test/OpenMP/nvptx_target_teams_distribute_parallel_for_simd_codegen.cpp --- a/clang/test/OpenMP/nvptx_target_teams_distribute_parallel_for_simd_codegen.cpp +++ b/clang/test/OpenMP/nvptx_target_teams_distribute_parallel_for_simd_codegen.cpp @@ -1,32 +1,27 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature --include-generated-funcs --replace-function-regex "__omp_offloading_[0-9a-z]+_[0-9a-z]+" --prefix-filecheck-ir-name _ // Test target codegen - host bc file has to be created first. 
// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc
-// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64 --check-prefix SEQ
-// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64 --check-prefix PAR
+// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix CHECK1
+// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix CHECK2
// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -x c++ -triple i386-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm-bc %s -o %t-x86-host.bc
-// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 --check-prefix SEQ
-// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 --check-prefix SEQ
-// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 --check-prefix PAR
-// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 --check-prefix PAR
+// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK3
+// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK4
+// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix CHECK5
+// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix CHECK6
// RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc
-// RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64 --check-prefix SEQ
-// RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64 --check-prefix PAR
+// RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix CHECK7
+// RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix CHECK8
// RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -x c++ -triple i386-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm-bc %s -o %t-x86-host.bc
-// RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 --check-prefix SEQ
-// RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 --check-prefix SEQ
-// RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 --check-prefix PAR
-// RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 --check-prefix PAR
+// RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK9
+// RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK10
+// RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix CHECK11
+// RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix CHECK12
// expected-no-diagnostics
#ifndef HEADER
#define HEADER
-// Check that the execution mode of all 4 target regions on the gpu is set to SPMD Mode.
-// CHECK-DAG: {{@__omp_offloading_.+l43}}_exec_mode = weak constant i8 0
-// CHECK-DAG: {{@__omp_offloading_.+l49}}_exec_mode = weak constant i8 0
-// CHECK-DAG: {{@__omp_offloading_.+l54}}_exec_mode = weak constant i8 0
-// CHECK-DAG: {{@__omp_offloading_.+l59}}_exec_mode = weak constant i8 0
-
#define N 1000
#define M 10
@@ -75,83 +70,12785 @@
return a;
}
-// SEQ-DAG: [[MEM_TY:%.+]] = type { [128 x i8] }
-// SEQ-DAG: [[SHARED_GLOBAL_RD:@.+]] = weak addrspace(3) global [[MEM_TY]] undef
-// SEQ-DAG: [[KERNEL_PTR:@.+]] = internal addrspace(3) global i8* undef
-// SEQ-DAG: [[KERNEL_SIZE:@.+]] = internal unnamed_addr constant i{{64|32}} 4
-// SEQ-DAG: [[KERNEL_SHARED:@.+]] = internal unnamed_addr constant i16 1
-
-// CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+}}_l43(
-// CHECK-DAG: [[THREAD_LIMIT:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 [[THREAD_LIMIT]], i16 0)
-// CHECK: call void @__kmpc_spmd_kernel_deinit_v2(i16 0)
-
-// SEQ: [[SHARED:%.+]] = load i16, i16* [[KERNEL_SHARED]],
-// SEQ: [[SIZE:%.+]] = load i{{64|32}}, i{{64|32}}* [[KERNEL_SIZE]],
-// SEQ: call void @__kmpc_get_team_static_memory(i16 1, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds ([[MEM_TY]], [[MEM_TY]] addrspace(3)* [[SHARED_GLOBAL_RD]], i32 0, i32 0, i32 0) to i8*), i{{64|32}} [[SIZE]], i16 [[SHARED]], i8** addrspacecast (i8* addrspace(3)* [[KERNEL_PTR]] to i8**))
-// SEQ: [[TEAM_ALLOC:%.+]] = load i8*, i8* addrspace(3)* [[KERNEL_PTR]],
-// SEQ: [[PTR:%.+]] = getelementptr inbounds i8, i8* [[TEAM_ALLOC]], i{{64|32}} 0
-// PAR: [[PTR:%.+]] = call i8* @__kmpc_data_sharing_push_stack(i{{32|64}} 4, i16 1)
-// CHECK: [[BC:%.+]] = bitcast i8* [[PTR]] to [[REC:%.+]]*
-// CHECK: getelementptr inbounds [[REC]], [[REC]]* [[BC]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
-// CHECK: call void @__kmpc_for_static_init_4({{.+}}, {{.+}}, {{.+}} 91,
-// CHECK: {{call|invoke}} void [[OUTL1:@.+]](
-// CHECK: call void @__kmpc_for_static_fini(
-// SEQ: [[SHARED:%.+]] = load i16, i16* [[KERNEL_SHARED]],
-// SEQ: call void @__kmpc_restore_team_static_memory(i16 1, i16 [[SHARED]])
-// PAR: call void @__kmpc_data_sharing_pop_stack(i8* [[PTR]])
-// CHECK: ret void
-
-// CHECK: define internal void [[OUTL1]](
-// CHECK: call void @__kmpc_for_static_init_4({{.+}}, {{.+}}, {{.+}} 33,
-// CHECK: call void @__kmpc_for_static_fini(
-// CHECK: ret void
-
-// CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+}}(
-// CHECK-DAG: [[THREAD_LIMIT:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 [[THREAD_LIMIT]], i16 0)
-// CHECK: call void @__kmpc_spmd_kernel_deinit_v2(i16 0)
-
-// CHECK: call void @__kmpc_for_static_init_4({{.+}}, {{.+}}, {{.+}} 91,
-// CHECK: {{call|invoke}} void [[OUTL2:@.+]](
-// CHECK: call void @__kmpc_for_static_fini(
-// CHECK: ret void
-
-// CHECK: define internal void [[OUTL2]](
-// CHECK: call void @__kmpc_for_static_init_4({{.+}}, {{.+}}, {{.+}} 33,
-// CHECK: call void @__kmpc_for_static_fini(
-// CHECK: ret void
-
-// CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+}}(
-// CHECK-DAG: [[THREAD_LIMIT:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 [[THREAD_LIMIT]], i16 0)
-// CHECK: call void @__kmpc_spmd_kernel_deinit_v2(i16 0)
-
-// CHECK: call void @__kmpc_for_static_init_4({{.+}}, {{.+}}, {{.+}} 91,
-// CHECK: {{call|invoke}} void [[OUTL3:@.+]](
-// CHECK: call void @__kmpc_for_static_fini(
-// CHECK: ret void
-
-// CHECK: define internal void [[OUTL3]](
-// CHECK: call void @__kmpc_for_static_init_4({{.+}}, {{.+}}, {{.+}} 33,
-// CHECK: call void @__kmpc_for_static_fini(
-// CHECK: ret void
-
-// CHECK: define {{.*}}void {{@__omp_offloading_.+}}({{.+}}, i{{32|64}} [[F_IN:%.+]])
-// CHECK: store {{.+}} [[F_IN]], {{.+}}* {{.+}},
-// CHECK-DAG: [[THREAD_LIMIT:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 [[THREAD_LIMIT]], i16 0)
-// CHECK: call void @__kmpc_spmd_kernel_deinit_v2(i16 0)
-
-// CHECK: store {{.+}} 99, {{.+}}* [[COMB_UB:%.+]], align
-// CHECK: call void @__kmpc_for_static_init_4({{.+}}, {{.+}}, {{.+}} 91, {{.+}}, {{.+}}, {{.+}}* [[COMB_UB]],
-// CHECK: {{call|invoke}} void [[OUTL4:@.+]](
-// CHECK: call void @__kmpc_for_static_fini(
-// CHECK: ret void
-
-// CHECK: define internal void [[OUTL4]](
-// CHECK: call void @__kmpc_for_static_init_4({{.+}}, {{.+}}, {{.+}} 33,
-// CHECK: call void @__kmpc_for_static_fini(
-// CHECK: ret void
-
#endif
+// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l38
+// CHECK1-SAME: (i64 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i64 [[L:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 8
+// CHECK1-NEXT: [[L_ADDR:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[L_CASTED:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK1-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8
+// CHECK1-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 8
+// CHECK1-NEXT: store i64 [[L]], i64* [[L_ADDR]], align 8
+// CHECK1-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32*
+// CHECK1-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 8
+// CHECK1-NEXT: [[CONV1:%.*]] = bitcast i64* [[L_ADDR]] to i32*
+// CHECK1-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK1-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0)
+// CHECK1-NEXT: br label [[DOTEXECUTE:%.*]]
+// CHECK1: .execute:
+// CHECK1-NEXT: [[TMP1:%.*]] = call i32
@__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3:[0-9]+]]) +// CHECK1-NEXT: [[TMP2:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK1-NEXT: [[CONV2:%.*]] = bitcast i64* [[N_CASTED]] to i32* +// CHECK1-NEXT: store i32 [[TMP2]], i32* [[CONV2]], align 4 +// CHECK1-NEXT: [[TMP3:%.*]] = load i64, i64* [[N_CASTED]], align 8 +// CHECK1-NEXT: [[TMP4:%.*]] = load i32, i32* [[CONV1]], align 8 +// CHECK1-NEXT: [[CONV3:%.*]] = bitcast i64* [[L_CASTED]] to i32* +// CHECK1-NEXT: store i32 [[TMP4]], i32* [[CONV3]], align 4 +// CHECK1-NEXT: [[TMP5:%.*]] = load i64, i64* [[L_CASTED]], align 8 +// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK1-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i64 [[TMP3]], [1000 x i32]* [[TMP0]], i64 [[TMP5]]) #[[ATTR2:[0-9]+]] +// CHECK1-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK1: .omp.deinit: +// CHECK1-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) +// CHECK1-NEXT: br label [[DOTEXIT:%.*]] +// CHECK1: .exit: +// CHECK1-NEXT: ret void +// +// +// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__ +// CHECK1-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i64 [[L:%.*]]) #[[ATTR0]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK1-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 8 +// CHECK1-NEXT: [[L_ADDR:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTCAPTURE_EXPR_3:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[I5:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[L_CASTED:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [5 x i8*], align 8 +// CHECK1-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK1-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +// CHECK1-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 +// CHECK1-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 8 +// CHECK1-NEXT: store i64 [[L]], i64* [[L_ADDR]], align 8 +// CHECK1-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* +// CHECK1-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 8 +// CHECK1-NEXT: [[CONV1:%.*]] = bitcast i64* [[L_ADDR]] to i32* +// CHECK1-NEXT: [[TMP1:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared", align 2 +// CHECK1-NEXT: [[TMP2:%.*]] = load i64, i64* @"_openmp_static_kernel$size", align 8 +// CHECK1-NEXT: call void @__kmpc_get_team_static_memory(i16 1, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds (%"union._shared_openmp_static_memory_type_$_", %"union._shared_openmp_static_memory_type_$_" addrspace(3)* @"_openmp_shared_static_glob_rd_$_", i32 0, i32 0, i32 0) to i8*), i64 [[TMP2]], i16 [[TMP1]], i8** addrspacecast (i8* addrspace(3)* 
@"_openmp_kernel_static_glob_rd$ptr" to i8**)) +// CHECK1-NEXT: [[TMP3:%.*]] = load i8*, i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr", align 8 +// CHECK1-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, i8* [[TMP3]], i64 0 +// CHECK1-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to %struct._globalized_locals_ty* +// CHECK1-NEXT: [[L2:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP5]], i32 0, i32 0 +// CHECK1-NEXT: [[TMP6:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK1-NEXT: store i32 [[TMP6]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK1-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK1-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP7]], 0 +// CHECK1-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK1-NEXT: [[SUB4:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK1-NEXT: store i32 [[SUB4]], i32* [[DOTCAPTURE_EXPR_3]], align 4 +// CHECK1-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK1-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK1-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP8]] +// CHECK1-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK1: omp.precond.then: +// CHECK1-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK1-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4 +// CHECK1-NEXT: store i32 [[TMP9]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK1-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK1-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK1-NEXT: [[TMP10:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK1-NEXT: [[TMP11:%.*]] = load i32, i32* [[TMP10]], align 4 +// CHECK1-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1:[0-9]+]], i32 [[TMP11]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 128) +// CHECK1-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK1-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4 +// CHECK1-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[TMP12]], [[TMP13]] +// CHECK1-NEXT: br i1 [[CMP6]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK1: cond.true: +// CHECK1-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4 +// CHECK1-NEXT: br label [[COND_END:%.*]] +// CHECK1: cond.false: +// CHECK1-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK1-NEXT: br label [[COND_END]] +// CHECK1: cond.end: +// CHECK1-NEXT: [[COND:%.*]] = phi i32 [ [[TMP14]], [[COND_TRUE]] ], [ [[TMP15]], [[COND_FALSE]] ] +// CHECK1-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK1-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK1-NEXT: store i32 [[TMP16]], i32* [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK1: omp.inner.for.cond: +// CHECK1-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4 +// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP18]], 1 +// CHECK1-NEXT: [[CMP7:%.*]] = icmp slt i32 [[TMP17]], [[ADD]] +// CHECK1-NEXT: br i1 [[CMP7]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK1: omp.inner.for.body: +// CHECK1-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK1-NEXT: [[TMP20:%.*]] = zext i32 [[TMP19]] to i64 +// CHECK1-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// 
CHECK1-NEXT: [[TMP22:%.*]] = zext i32 [[TMP21]] to i64
+// CHECK1-NEXT: [[TMP23:%.*]] = load i32, i32* [[CONV]], align 8
+// CHECK1-NEXT: [[CONV8:%.*]] = bitcast i64* [[N_CASTED]] to i32*
+// CHECK1-NEXT: store i32 [[TMP23]], i32* [[CONV8]], align 4
+// CHECK1-NEXT: [[TMP24:%.*]] = load i64, i64* [[N_CASTED]], align 8
+// CHECK1-NEXT: [[TMP25:%.*]] = load i32, i32* [[CONV1]], align 8
+// CHECK1-NEXT: [[CONV9:%.*]] = bitcast i64* [[L_CASTED]] to i32*
+// CHECK1-NEXT: store i32 [[TMP25]], i32* [[CONV9]], align 4
+// CHECK1-NEXT: [[TMP26:%.*]] = load i64, i64* [[L_CASTED]], align 8
+// CHECK1-NEXT: [[TMP27:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0
+// CHECK1-NEXT: [[TMP28:%.*]] = inttoptr i64 [[TMP20]] to i8*
+// CHECK1-NEXT: store i8* [[TMP28]], i8** [[TMP27]], align 8
+// CHECK1-NEXT: [[TMP29:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1
+// CHECK1-NEXT: [[TMP30:%.*]] = inttoptr i64 [[TMP22]] to i8*
+// CHECK1-NEXT: store i8* [[TMP30]], i8** [[TMP29]], align 8
+// CHECK1-NEXT: [[TMP31:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 2
+// CHECK1-NEXT: [[TMP32:%.*]] = inttoptr i64 [[TMP24]] to i8*
+// CHECK1-NEXT: store i8* [[TMP32]], i8** [[TMP31]], align 8
+// CHECK1-NEXT: [[TMP33:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 3
+// CHECK1-NEXT: [[TMP34:%.*]] = bitcast [1000 x i32]* [[TMP0]] to i8*
+// CHECK1-NEXT: store i8* [[TMP34]], i8** [[TMP33]], align 8
+// CHECK1-NEXT: [[TMP35:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 4
+// CHECK1-NEXT: [[TMP36:%.*]] = inttoptr i64 [[TMP26]] to i8*
+// CHECK1-NEXT: store i8* [[TMP36]], i8** [[TMP35]], align 8
+// CHECK1-NEXT: [[TMP37:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT: [[TMP38:%.*]] = load i32, i32* [[TMP37]], align 4
+// CHECK1-NEXT: [[TMP39:%.*]] = bitcast [5 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK1-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP38]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i64, i64, i64, [1000 x i32]*, i64)* @__omp_outlined__1 to i8*), i8* null, i8** [[TMP39]], i64 5)
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK1: omp.inner.for.inc:
+// CHECK1-NEXT: [[TMP40:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: [[TMP41:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK1-NEXT: [[ADD10:%.*]] = add nsw i32 [[TMP40]], [[TMP41]]
+// CHECK1-NEXT: store i32 [[ADD10]], i32* [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: [[TMP42:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK1-NEXT: [[TMP43:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK1-NEXT: [[ADD11:%.*]] = add nsw i32 [[TMP42]], [[TMP43]]
+// CHECK1-NEXT: store i32 [[ADD11]], i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK1-NEXT: [[TMP44:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: [[TMP45:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK1-NEXT: [[ADD12:%.*]] = add nsw i32 [[TMP44]], [[TMP45]]
+// CHECK1-NEXT: store i32 [[ADD12]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: [[TMP46:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: [[TMP47:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4
+// CHECK1-NEXT: [[CMP13:%.*]] = icmp sgt i32 [[TMP46]], [[TMP47]]
+// CHECK1-NEXT: br i1 [[CMP13]], label [[COND_TRUE14:%.*]], label [[COND_FALSE15:%.*]]
+// CHECK1: cond.true14:
+// CHECK1-NEXT: [[TMP48:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4
+// CHECK1-NEXT: br label [[COND_END16:%.*]]
+// CHECK1: cond.false15:
+// CHECK1-NEXT: [[TMP49:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: br label [[COND_END16]]
+// CHECK1: cond.end16:
+// CHECK1-NEXT: [[COND17:%.*]] = phi i32 [ [[TMP48]], [[COND_TRUE14]] ], [ [[TMP49]], [[COND_FALSE15]] ]
+// CHECK1-NEXT: store i32 [[COND17]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: [[TMP50:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK1-NEXT: store i32 [[TMP50]], i32* [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP10:![0-9]+]]
+// CHECK1: omp.inner.for.end:
+// CHECK1-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK1: omp.loop.exit:
+// CHECK1-NEXT: [[TMP51:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT: [[TMP52:%.*]] = load i32, i32* [[TMP51]], align 4
+// CHECK1-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP52]])
+// CHECK1-NEXT: [[TMP53:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK1-NEXT: [[TMP54:%.*]] = icmp ne i32 [[TMP53]], 0
+// CHECK1-NEXT: br i1 [[TMP54]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]]
+// CHECK1: .omp.final.then:
+// CHECK1-NEXT: [[TMP55:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK1-NEXT: [[SUB18:%.*]] = sub nsw i32 [[TMP55]], 0
+// CHECK1-NEXT: [[DIV19:%.*]] = sdiv i32 [[SUB18]], 1
+// CHECK1-NEXT: [[MUL:%.*]] = mul nsw i32 [[DIV19]], 1
+// CHECK1-NEXT: [[ADD20:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK1-NEXT: store i32 [[ADD20]], i32* [[I5]], align 4
+// CHECK1-NEXT: br label [[DOTOMP_FINAL_DONE]]
+// CHECK1: .omp.final.done:
+// CHECK1-NEXT: [[TMP56:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK1-NEXT: [[TMP57:%.*]] = icmp ne i32 [[TMP56]], 0
+// CHECK1-NEXT: br i1 [[TMP57]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]]
+// CHECK1: .omp.lastprivate.then:
+// CHECK1-NEXT: [[TMP58:%.*]] = load i32, i32* [[CONV1]], align 8
+// CHECK1-NEXT: store i32 [[TMP58]], i32* [[CONV1]], align 8
+// CHECK1-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]]
+// CHECK1: .omp.lastprivate.done:
+// CHECK1-NEXT: br label [[OMP_PRECOND_END]]
+// CHECK1: omp.precond.end:
+// CHECK1-NEXT: [[TMP59:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared", align 2
+// CHECK1-NEXT: call void @__kmpc_restore_team_static_memory(i16 1, i16 [[TMP59]])
+// CHECK1-NEXT: ret void
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__1
+// CHECK1-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], i64 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i64 [[L:%.*]]) #[[ATTR0]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK1-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 8
+// CHECK1-NEXT: [[L_ADDR:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[I6:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
+// CHECK1-NEXT: store i64 [[DOTPREVIOUS_LB_]], i64* [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK1-NEXT: store i64 [[DOTPREVIOUS_UB_]], i64* [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK1-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8
+// CHECK1-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 8
+// CHECK1-NEXT: store i64 [[L]], i64* [[L_ADDR]], align 8
+// CHECK1-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32*
+// CHECK1-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 8
+// CHECK1-NEXT: [[CONV1:%.*]] = bitcast i64* [[L_ADDR]] to i32*
+// CHECK1-NEXT: [[TMP1:%.*]] = load i32, i32* [[CONV]], align 8
+// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK1-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK1-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0
+// CHECK1-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK1-NEXT: [[SUB3:%.*]] = sub nsw i32 [[DIV]], 1
+// CHECK1-NEXT: store i32 [[SUB3]], i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK1-NEXT: store i32 0, i32* [[I]], align 4
+// CHECK1-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK1-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]]
+// CHECK1-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]]
+// CHECK1: omp.precond.then:
+// CHECK1-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK1-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK1-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_UB]], align 4
+// CHECK1-NEXT: [[TMP5:%.*]] = load i64, i64* [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK1-NEXT: [[CONV4:%.*]] = trunc i64 [[TMP5]] to i32
+// CHECK1-NEXT: [[TMP6:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK1-NEXT: [[CONV5:%.*]] = trunc i64 [[TMP6]] to i32
+// CHECK1-NEXT: store i32 [[CONV4]], i32* [[DOTOMP_LB]], align 4
+// CHECK1-NEXT: store i32 [[CONV5]], i32* [[DOTOMP_UB]], align 4
+// CHECK1-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK1-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK1-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4
+// CHECK1-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP8]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 32)
+// CHECK1-NEXT: br label [[OMP_DISPATCH_COND:%.*]]
+// CHECK1: omp.dispatch.cond:
+// CHECK1-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK1-NEXT: [[CONV7:%.*]] = sext i32 [[TMP9]] to i64
+// CHECK1-NEXT: [[TMP10:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK1-NEXT: [[CMP8:%.*]] = icmp ugt i64 [[CONV7]], [[TMP10]]
+// CHECK1-NEXT: br i1 [[CMP8]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK1: cond.true:
+// CHECK1-NEXT: [[TMP11:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK1-NEXT: br label [[COND_END:%.*]]
+// CHECK1: cond.false:
+// CHECK1-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK1-NEXT: [[CONV9:%.*]] = sext i32 [[TMP12]] to i64
+// CHECK1-NEXT: br label [[COND_END]]
+// CHECK1: cond.end:
+// CHECK1-NEXT: [[COND:%.*]] = phi i64 [ [[TMP11]], [[COND_TRUE]] ], [ [[CONV9]], [[COND_FALSE]] ]
+// CHECK1-NEXT: [[CONV10:%.*]] = trunc i64 [[COND]] to i32
+// CHECK1-NEXT: store i32 [[CONV10]], i32* [[DOTOMP_UB]], align 4
+// CHECK1-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK1-NEXT: store i32 [[TMP13]], i32* [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK1-NEXT: [[CMP11:%.*]] = icmp sle i32 [[TMP14]], [[TMP15]]
+// CHECK1-NEXT: br i1 [[CMP11]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]]
+// CHECK1: omp.dispatch.body:
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK1: omp.inner.for.cond:
+// CHECK1-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK1-NEXT: [[CMP12:%.*]] = icmp sle i32 [[TMP16]], [[TMP17]]
+// CHECK1-NEXT: br i1 [[CMP12]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK1: omp.inner.for.body:
+// CHECK1-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP18]], 1
+// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK1-NEXT: store i32 [[ADD]], i32* [[I6]], align 4
+// CHECK1-NEXT: [[TMP19:%.*]] = load i32, i32* [[I6]], align 4
+// CHECK1-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP19]] to i64
+// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1000 x i32], [1000 x i32]* [[TMP0]], i64 0, i64 [[IDXPROM]]
+// CHECK1-NEXT: store i32 1, i32* [[ARRAYIDX]], align 4
+// CHECK1-NEXT: [[TMP20:%.*]] = load i32, i32* [[I6]], align 4
+// CHECK1-NEXT: store i32 [[TMP20]], i32* [[CONV1]], align 8
+// CHECK1-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK1: omp.body.continue:
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK1: omp.inner.for.inc:
+// CHECK1-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: [[ADD13:%.*]] = add nsw i32 [[TMP21]], 1
+// CHECK1-NEXT: store i32 [[ADD13]], i32* [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP12:![0-9]+]]
+// CHECK1: omp.inner.for.end:
+// CHECK1-NEXT: br label [[OMP_DISPATCH_INC:%.*]]
+// CHECK1: omp.dispatch.inc:
+// CHECK1-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK1-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK1-NEXT: [[ADD14:%.*]] = add nsw i32 [[TMP22]], [[TMP23]]
+// CHECK1-NEXT: store i32 [[ADD14]], i32* [[DOTOMP_LB]], align 4
+// CHECK1-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK1-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK1-NEXT: [[ADD15:%.*]] = add nsw i32 [[TMP24]], [[TMP25]]
+// CHECK1-NEXT: store i32 [[ADD15]], i32* [[DOTOMP_UB]], align 4
+// CHECK1-NEXT: br label [[OMP_DISPATCH_COND]]
+// CHECK1: omp.dispatch.end:
+// CHECK1-NEXT: [[TMP26:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT: [[TMP27:%.*]] = load i32, i32* [[TMP26]], align 4
+// CHECK1-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP27]])
+// CHECK1-NEXT: [[TMP28:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK1-NEXT: [[TMP29:%.*]] = icmp ne i32 [[TMP28]], 0
+// CHECK1-NEXT: br i1 [[TMP29]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]]
+// CHECK1: .omp.final.then:
+// CHECK1-NEXT: [[TMP30:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK1-NEXT: [[SUB16:%.*]] = sub nsw i32 [[TMP30]], 0
+// CHECK1-NEXT: [[DIV17:%.*]] = sdiv i32 [[SUB16]], 1
+// CHECK1-NEXT: [[MUL18:%.*]] = mul nsw i32 [[DIV17]], 1
+// CHECK1-NEXT: [[ADD19:%.*]] = add nsw i32 0, [[MUL18]]
+// CHECK1-NEXT: store i32 [[ADD19]], i32* [[I6]], align 4
+// CHECK1-NEXT: br label [[DOTOMP_FINAL_DONE]]
+// CHECK1: .omp.final.done:
+// CHECK1-NEXT: [[TMP31:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK1-NEXT: [[TMP32:%.*]] = icmp ne i32 [[TMP31]], 0
+// CHECK1-NEXT: br i1 [[TMP32]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]]
+// CHECK1: .omp.lastprivate.then:
+// CHECK1-NEXT: [[TMP33:%.*]] = load i32, i32* [[CONV1]], align 8
+// CHECK1-NEXT: store i32 [[TMP33]], i32* [[CONV1]], align 8
+// CHECK1-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]]
+// CHECK1: .omp.lastprivate.done:
+// CHECK1-NEXT: br label [[OMP_PRECOND_END]]
+// CHECK1: omp.precond.end:
+// CHECK1-NEXT: ret void
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l44
+// CHECK1-SAME: (i64 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 8
+// CHECK1-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK1-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8
+// CHECK1-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 8
+// CHECK1-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32*
+// CHECK1-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 8
+// CHECK1-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK1-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0)
+// CHECK1-NEXT: br label [[DOTEXECUTE:%.*]]
+// CHECK1: .execute:
+// CHECK1-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]])
+// CHECK1-NEXT: [[TMP2:%.*]] = load i32, i32* [[CONV]], align 8
+// CHECK1-NEXT: [[CONV1:%.*]] = bitcast i64* [[N_CASTED]] to i32*
+// CHECK1-NEXT: store i32 [[TMP2]], i32* [[CONV1]], align 4
+// CHECK1-NEXT: [[TMP3:%.*]] = load i64, i64* [[N_CASTED]], align 8
+// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK1-NEXT: call void @__omp_outlined__2(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i64 [[TMP3]], [1000 x i16]* [[TMP0]]) #[[ATTR2]]
+// CHECK1-NEXT: br label [[DOTOMP_DEINIT:%.*]]
+// CHECK1: .omp.deinit:
+// CHECK1-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0)
+// CHECK1-NEXT: br label [[DOTEXIT:%.*]]
+// CHECK1: .exit:
+// CHECK1-NEXT: ret void
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__2
+// CHECK1-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK1-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 8
+// CHECK1-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[I3:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x i8*], align 8
+// CHECK1-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
+// CHECK1-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8
+// CHECK1-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 8
+// CHECK1-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32*
+// CHECK1-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 8
+// CHECK1-NEXT: [[TMP1:%.*]] = load i32, i32* [[CONV]], align 8
+// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK1-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK1-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0
+// CHECK1-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK1-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1
+// CHECK1-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK1-NEXT: store i32 0, i32* [[I]], align 4
+// CHECK1-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK1-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]]
+// CHECK1-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]]
+// CHECK1: omp.precond.then:
+// CHECK1-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK1-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK1-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK1-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK1-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK1-NEXT: [[TMP5:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4
+// CHECK1-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP6]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]])
+// CHECK1-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK1-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP7]], [[TMP8]]
+// CHECK1-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK1: cond.true:
+// CHECK1-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK1-NEXT: br label [[COND_END:%.*]]
+// CHECK1: cond.false:
+// CHECK1-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: br label [[COND_END]]
+// CHECK1: cond.end:
+// CHECK1-NEXT: [[COND:%.*]] = phi i32 [ [[TMP9]], [[COND_TRUE]] ], [ [[TMP10]], [[COND_FALSE]] ]
+// CHECK1-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK1-NEXT: store i32 [[TMP11]], i32* [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK1: omp.inner.for.cond:
+// CHECK1-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP13]], 1
+// CHECK1-NEXT: [[CMP5:%.*]] = icmp slt i32 [[TMP12]], [[ADD]]
+// CHECK1-NEXT: br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK1: omp.inner.for.body:
+// CHECK1-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK1-NEXT: [[TMP15:%.*]] = zext i32 [[TMP14]] to i64
+// CHECK1-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: [[TMP17:%.*]] = zext i32 [[TMP16]] to i64
+// CHECK1-NEXT: [[TMP18:%.*]] = load i32, i32* [[CONV]], align 8
+// CHECK1-NEXT: [[CONV6:%.*]] = bitcast i64* [[N_CASTED]] to i32*
+// CHECK1-NEXT: store i32 [[TMP18]], i32* [[CONV6]], align 4
+// CHECK1-NEXT: [[TMP19:%.*]] = load i64, i64* [[N_CASTED]], align 8
+// CHECK1-NEXT: [[TMP20:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0
+// CHECK1-NEXT: [[TMP21:%.*]] = inttoptr i64 [[TMP15]] to i8*
+// CHECK1-NEXT: store i8* [[TMP21]], i8** [[TMP20]], align 8
+// CHECK1-NEXT: [[TMP22:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1
+// CHECK1-NEXT: [[TMP23:%.*]] = inttoptr i64 [[TMP17]] to i8*
+// CHECK1-NEXT: store i8* [[TMP23]], i8** [[TMP22]], align 8
+// CHECK1-NEXT: [[TMP24:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 2
+// CHECK1-NEXT: [[TMP25:%.*]] = inttoptr i64 [[TMP19]] to i8*
+// CHECK1-NEXT: store i8* [[TMP25]], i8** [[TMP24]], align 8
+// CHECK1-NEXT: [[TMP26:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 3
+// CHECK1-NEXT: [[TMP27:%.*]] = bitcast [1000 x i16]* [[TMP0]] to i8*
+// CHECK1-NEXT: store i8* [[TMP27]], i8** [[TMP26]], align 8
+// CHECK1-NEXT: [[TMP28:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT: [[TMP29:%.*]] = load i32, i32* [[TMP28]], align 4
+// CHECK1-NEXT: [[TMP30:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK1-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP29]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i64, i64, i64, [1000 x i16]*)* @__omp_outlined__3 to i8*), i8* null, i8** [[TMP30]], i64 4)
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK1: omp.inner.for.inc:
+// CHECK1-NEXT: [[TMP31:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: [[TMP32:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK1-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP31]], [[TMP32]]
+// CHECK1-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: [[TMP33:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK1-NEXT: [[TMP34:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK1-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP33]], [[TMP34]]
+// CHECK1-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK1-NEXT: [[TMP35:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: [[TMP36:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK1-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP35]], [[TMP36]]
+// CHECK1-NEXT: store i32 [[ADD9]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: [[TMP37:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: [[TMP38:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK1-NEXT: [[CMP10:%.*]] = icmp sgt i32 [[TMP37]], [[TMP38]]
+// CHECK1-NEXT: br i1 [[CMP10]], label [[COND_TRUE11:%.*]], label [[COND_FALSE12:%.*]]
+// CHECK1: cond.true11:
+// CHECK1-NEXT: [[TMP39:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK1-NEXT: br label [[COND_END13:%.*]]
+// CHECK1: cond.false12:
+// CHECK1-NEXT: [[TMP40:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: br label [[COND_END13]]
+// CHECK1: cond.end13:
+// CHECK1-NEXT: [[COND14:%.*]] = phi i32 [ [[TMP39]], [[COND_TRUE11]] ], [ [[TMP40]], [[COND_FALSE12]] ]
+// CHECK1-NEXT: store i32 [[COND14]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: [[TMP41:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK1-NEXT: store i32 [[TMP41]], i32* [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP13:![0-9]+]]
+// CHECK1: omp.inner.for.end:
+// CHECK1-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK1: omp.loop.exit:
+// CHECK1-NEXT: [[TMP42:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT: [[TMP43:%.*]] = load i32, i32* [[TMP42]], align 4
+// CHECK1-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP43]])
+// CHECK1-NEXT: [[TMP44:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK1-NEXT: [[TMP45:%.*]] = icmp ne i32 [[TMP44]], 0
+// CHECK1-NEXT: br i1 [[TMP45]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]]
+// CHECK1: .omp.final.then:
+// CHECK1-NEXT: [[TMP46:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK1-NEXT: [[SUB15:%.*]] = sub nsw i32 [[TMP46]], 0
+// CHECK1-NEXT: [[DIV16:%.*]] = sdiv i32 [[SUB15]], 1
+// CHECK1-NEXT: [[MUL:%.*]] = mul nsw i32 [[DIV16]], 1
+// CHECK1-NEXT: [[ADD17:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK1-NEXT: store i32 [[ADD17]], i32* [[I3]], align 4
+// CHECK1-NEXT: br label [[DOTOMP_FINAL_DONE]]
+// CHECK1: .omp.final.done:
+// CHECK1-NEXT: br label [[OMP_PRECOND_END]]
+// CHECK1: omp.precond.end:
+// CHECK1-NEXT: ret void
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__3
+// CHECK1-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], i64 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK1-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 8
+// CHECK1-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[I5:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
+// CHECK1-NEXT: store i64 [[DOTPREVIOUS_LB_]], i64* [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK1-NEXT: store i64 [[DOTPREVIOUS_UB_]], i64* [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK1-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8
+// CHECK1-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 8
+// CHECK1-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32*
+// CHECK1-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 8
+// CHECK1-NEXT: [[TMP1:%.*]] = load i32, i32* [[CONV]], align 8
+// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK1-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK1-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0
+// CHECK1-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK1-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1
+// CHECK1-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK1-NEXT: store i32 0, i32* [[I]], align 4
+// CHECK1-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK1-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]]
+// CHECK1-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]]
+// CHECK1: omp.precond.then:
+// CHECK1-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK1-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK1-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_UB]], align 4
+// CHECK1-NEXT: [[TMP5:%.*]] = load i64, i64* [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK1-NEXT: [[CONV3:%.*]] = trunc i64 [[TMP5]] to i32
+// CHECK1-NEXT: [[TMP6:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK1-NEXT: [[CONV4:%.*]] = trunc i64 [[TMP6]] to i32
+// CHECK1-NEXT: store i32 [[CONV3]], i32* [[DOTOMP_LB]], align 4
+// CHECK1-NEXT: store i32 [[CONV4]], i32* [[DOTOMP_UB]], align 4
+// CHECK1-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK1-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK1-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4
+// CHECK1-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP8]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK1-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK1-NEXT: store i32 [[TMP9]], i32* [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK1: omp.inner.for.cond:
+// CHECK1-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: [[CONV6:%.*]] = sext i32 [[TMP10]] to i64
+// CHECK1-NEXT: [[TMP11:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK1-NEXT: [[CMP7:%.*]] = icmp ule i64 [[CONV6]], [[TMP11]]
+// CHECK1-NEXT: br i1 [[CMP7]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK1: omp.inner.for.body:
+// CHECK1-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP12]], 1
+// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK1-NEXT: store i32 [[ADD]], i32* [[I5]], align 4
+// CHECK1-NEXT: [[TMP13:%.*]] = load i32, i32* [[I5]], align 4
+// CHECK1-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP13]] to i64
+// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1000 x i16], [1000 x i16]* [[TMP0]], i64 0, i64 [[IDXPROM]]
+// CHECK1-NEXT: [[TMP14:%.*]] = load i16, i16* [[ARRAYIDX]], align 2
+// CHECK1-NEXT: [[CONV8:%.*]] = sext i16 [[TMP14]] to i32
+// CHECK1-NEXT: [[ADD9:%.*]] = add nsw i32 [[CONV8]], 1
+// CHECK1-NEXT: [[CONV10:%.*]] = trunc i32 [[ADD9]] to i16
+// CHECK1-NEXT: store i16 [[CONV10]], i16* [[ARRAYIDX]], align 2
+// CHECK1-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK1: omp.body.continue:
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK1: omp.inner.for.inc:
+// CHECK1-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK1-NEXT: [[ADD11:%.*]] = add nsw i32 [[TMP15]], [[TMP16]]
+// CHECK1-NEXT: store i32 [[ADD11]], i32* [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP14:![0-9]+]]
+// CHECK1: omp.inner.for.end:
+// CHECK1-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK1: omp.loop.exit:
+// CHECK1-NEXT: [[TMP17:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT: [[TMP18:%.*]] = load i32, i32* [[TMP17]], align 4
+// CHECK1-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP18]])
+// CHECK1-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK1-NEXT: [[TMP20:%.*]] = icmp ne i32 [[TMP19]], 0
+// CHECK1-NEXT: br i1 [[TMP20]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]]
+// CHECK1: .omp.final.then:
+// CHECK1-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK1-NEXT: [[SUB12:%.*]] = sub nsw i32 [[TMP21]], 0
+// CHECK1-NEXT: [[DIV13:%.*]] = sdiv i32 [[SUB12]], 1
+// CHECK1-NEXT: [[MUL14:%.*]] = mul nsw i32 [[DIV13]], 1
+// CHECK1-NEXT: [[ADD15:%.*]] = add nsw i32 0, [[MUL14]]
+// CHECK1-NEXT: store i32 [[ADD15]], i32* [[I5]], align 4
+// CHECK1-NEXT: br label [[DOTOMP_FINAL_DONE]]
+// CHECK1: .omp.final.done:
+// CHECK1-NEXT: br label [[OMP_PRECOND_END]]
+// CHECK1: omp.precond.end:
+// CHECK1-NEXT: ret void
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l49
+// CHECK1-SAME: ([10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 8
+// CHECK1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK1-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 8
+// CHECK1-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 8
+// CHECK1-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK1-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0)
+// CHECK1-NEXT: br label [[DOTEXECUTE:%.*]]
+// CHECK1: .execute:
+// CHECK1-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]])
+// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK1-NEXT: call void @__omp_outlined__4(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x i32]* [[TMP0]]) #[[ATTR2]]
+// CHECK1-NEXT: br label [[DOTOMP_DEINIT:%.*]]
+// CHECK1: .omp.deinit:
+// CHECK1-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0)
+// CHECK1-NEXT: br label [[DOTEXIT:%.*]]
+// CHECK1: .exit:
+// CHECK1-NEXT: ret void
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__4
+// CHECK1-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK1-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 8
+// CHECK1-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [3 x i8*], align 8
+// CHECK1-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
+// CHECK1-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 8
+// CHECK1-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 8
+// CHECK1-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK1-NEXT: store i32 9, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK1-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK1-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK1-NEXT: [[TMP1:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4
+// CHECK1-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]])
+// CHECK1-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP3]], 9
+// CHECK1-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK1: cond.true:
+// CHECK1-NEXT: br label [[COND_END:%.*]]
+// CHECK1: cond.false:
+// CHECK1-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: br label [[COND_END]]
+// CHECK1: cond.end:
+// CHECK1-NEXT: [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP4]], [[COND_FALSE]] ]
+// CHECK1-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK1-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK1: omp.inner.for.cond:
+// CHECK1-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP6]], 10
+// CHECK1-NEXT: br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK1: omp.inner.for.body:
+// CHECK1-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK1-NEXT: [[TMP8:%.*]] = zext i32 [[TMP7]] to i64
+// CHECK1-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: [[TMP10:%.*]] = zext i32 [[TMP9]] to i64
+// CHECK1-NEXT: [[TMP11:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0
+// CHECK1-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP8]] to i8*
+// CHECK1-NEXT: store i8* [[TMP12]], i8** [[TMP11]], align 8
+// CHECK1-NEXT: [[TMP13:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1
+// CHECK1-NEXT: [[TMP14:%.*]] = inttoptr i64 [[TMP10]] to i8*
+// CHECK1-NEXT: store i8* [[TMP14]], i8** [[TMP13]], align 8
+// CHECK1-NEXT: [[TMP15:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 2
+// CHECK1-NEXT: [[TMP16:%.*]] = bitcast [10 x i32]* [[TMP0]] to i8*
+// CHECK1-NEXT: store i8* [[TMP16]], i8** [[TMP15]], align 8
+// CHECK1-NEXT: [[TMP17:%.*]] = bitcast [3 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK1-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i64, i64, [10 x i32]*)* @__omp_outlined__5 to i8*), i8* null, i8** [[TMP17]], i64 3)
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK1: omp.inner.for.inc:
+// CHECK1-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP18]], [[TMP19]]
+// CHECK1-NEXT: store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK1-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK1-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP20]], [[TMP21]]
+// CHECK1-NEXT: store i32 [[ADD2]], i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK1-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK1-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP22]], [[TMP23]]
+// CHECK1-NEXT: store i32 [[ADD3]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP24]], 9
+// CHECK1-NEXT: br i1 [[CMP4]], label [[COND_TRUE5:%.*]], label [[COND_FALSE6:%.*]]
+// CHECK1: cond.true5:
+// CHECK1-NEXT: br label [[COND_END7:%.*]]
+// CHECK1: cond.false6:
+// CHECK1-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: br label [[COND_END7]]
+// CHECK1: cond.end7:
+// CHECK1-NEXT: [[COND8:%.*]] = phi i32 [ 9, [[COND_TRUE5]] ], [ [[TMP25]], [[COND_FALSE6]] ]
+// CHECK1-NEXT: store i32 [[COND8]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: [[TMP26:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK1-NEXT: store i32 [[TMP26]], i32* [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP15:![0-9]+]]
+// CHECK1: omp.inner.for.end:
+// CHECK1-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK1: omp.loop.exit:
+// CHECK1-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]])
+// CHECK1-NEXT: [[TMP27:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK1-NEXT: [[TMP28:%.*]] = icmp ne i32 [[TMP27]], 0
+// CHECK1-NEXT: br i1 [[TMP28]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]]
+// CHECK1: .omp.final.then:
+// CHECK1-NEXT: store i32 10, i32* [[I]], align 4
+// CHECK1-NEXT: br label [[DOTOMP_FINAL_DONE]]
+// CHECK1: .omp.final.done:
+// CHECK1-NEXT: ret void
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__5
+// CHECK1-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK1-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 8
+// CHECK1-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
+// CHECK1-NEXT: store i64 [[DOTPREVIOUS_LB_]], i64* [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK1-NEXT: store i64 [[DOTPREVIOUS_UB_]], i64* [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK1-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 8
+// CHECK1-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 8
+// CHECK1-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK1-NEXT: store i32 9, i32* [[DOTOMP_UB]], align 4
+// CHECK1-NEXT: [[TMP1:%.*]] = load i64, i64* [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK1-NEXT: [[CONV:%.*]] = trunc i64 [[TMP1]] to i32
+// CHECK1-NEXT: [[TMP2:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK1-NEXT: [[CONV1:%.*]] = trunc i64 [[TMP2]] to i32
+// CHECK1-NEXT: store i32 [[CONV]], i32* [[DOTOMP_LB]], align 4
+// CHECK1-NEXT: store i32 [[CONV1]], i32* [[DOTOMP_UB]], align 4
+// CHECK1-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK1-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK1-NEXT: [[TMP3:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4
+// CHECK1-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP4]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK1-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK1-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK1: omp.inner.for.cond:
+// CHECK1-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: [[CONV2:%.*]] = sext i32 [[TMP6]] to i64
+// CHECK1-NEXT: [[TMP7:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK1-NEXT: [[CMP:%.*]] = icmp ule i64 [[CONV2]], [[TMP7]]
+// CHECK1-NEXT: br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK1: omp.inner.for.body:
+// CHECK1-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP8]], 1
+// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK1-NEXT: store i32 [[ADD]], i32* [[I]], align 4
+// CHECK1-NEXT: [[TMP9:%.*]] = load i32, i32* [[I]], align 4
+// CHECK1-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP9]] to i64
+// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP0]], i64 0, i64 [[IDXPROM]]
+// CHECK1-NEXT: [[TMP10:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+// CHECK1-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP10]], 1
+// CHECK1-NEXT: store i32 [[ADD3]], i32* [[ARRAYIDX]], align 4
+// CHECK1-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK1: omp.body.continue:
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK1: omp.inner.for.inc:
+// CHECK1-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK1-NEXT: [[ADD4:%.*]] = add nsw i32 [[TMP11]], [[TMP12]]
+// CHECK1-NEXT: store i32 [[ADD4]], i32* [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP16:![0-9]+]]
+// CHECK1: omp.inner.for.end:
+// CHECK1-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK1: omp.loop.exit:
+// CHECK1-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP4]])
+// CHECK1-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK1-NEXT: [[TMP14:%.*]] = icmp ne i32 [[TMP13]], 0
+// CHECK1-NEXT: br i1 [[TMP14]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]]
+// CHECK1: .omp.final.then:
+// CHECK1-NEXT: store i32 10, i32* [[I]], align 4
+// CHECK1-NEXT: br label [[DOTOMP_FINAL_DONE]]
+// CHECK1: .omp.final.done:
+// CHECK1-NEXT: ret void
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l54
+// CHECK1-SAME: ([10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i64 [[F:%.*]]) #[[ATTR0]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 8
+// CHECK1-NEXT: [[F_ADDR:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[F_CASTED:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK1-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 8
+// CHECK1-NEXT: store i64 [[F]], i64* [[F_ADDR]], align 8
+// CHECK1-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 8
+// CHECK1-NEXT: [[CONV:%.*]] = bitcast i64* [[F_ADDR]] to i32*
+// CHECK1-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK1-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0)
+// CHECK1-NEXT: br label [[DOTEXECUTE:%.*]]
+// CHECK1: .execute:
+// CHECK1-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]])
+// CHECK1-NEXT: [[TMP2:%.*]] = load i32, i32* [[CONV]], align 8
+// CHECK1-NEXT: [[CONV1:%.*]] = bitcast i64* [[F_CASTED]] to i32*
+// CHECK1-NEXT: store i32 [[TMP2]], i32* [[CONV1]], align 4
+// CHECK1-NEXT: [[TMP3:%.*]] = load i64, i64* [[F_CASTED]], align 8
+// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK1-NEXT: call void @__omp_outlined__6(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x [10 x i32]]* [[TMP0]], i64 [[TMP3]]) #[[ATTR2]]
+// CHECK1-NEXT: br label [[DOTOMP_DEINIT:%.*]]
+// CHECK1: .omp.deinit:
+// CHECK1-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0)
+// CHECK1-NEXT: br label [[DOTEXIT:%.*]]
+// CHECK1: .exit:
+// CHECK1-NEXT: ret void
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__6
+// CHECK1-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i64 [[F:%.*]]) #[[ATTR0]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK1-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 8
+// CHECK1-NEXT: [[F_ADDR:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[_TMP1:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[K:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[J:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[F_CASTED:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x i8*], align 8
+// CHECK1-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
+// CHECK1-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 8
+// CHECK1-NEXT: store i64 [[F]], i64* [[F_ADDR]], align 8
+// CHECK1-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 8
+// CHECK1-NEXT: [[CONV:%.*]] = bitcast i64* [[F_ADDR]] to i32*
+// CHECK1-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK1-NEXT: store i32 99, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK1-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK1-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK1-NEXT: [[TMP1:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4
+// CHECK1-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]])
+// CHECK1-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP3]], 99
+// CHECK1-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK1: cond.true:
+// CHECK1-NEXT: br label [[COND_END:%.*]]
+// CHECK1: cond.false:
+// CHECK1-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: br label [[COND_END]]
+// CHECK1: cond.end:
+// CHECK1-NEXT: [[COND:%.*]] = phi i32 [ 99, [[COND_TRUE]] ], [ [[TMP4]], [[COND_FALSE]] ]
+// CHECK1-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK1-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK1: omp.inner.for.cond:
+// CHECK1-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: [[CMP2:%.*]] = icmp slt i32 [[TMP6]], 100
+// CHECK1-NEXT: br i1 [[CMP2]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK1: omp.inner.for.body:
+// CHECK1-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK1-NEXT: [[TMP8:%.*]] = zext i32 [[TMP7]] to i64
+// CHECK1-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: [[TMP10:%.*]] = zext i32 [[TMP9]] to i64
+// CHECK1-NEXT: [[TMP11:%.*]] = load i32, i32* [[CONV]], align 8
+// CHECK1-NEXT: [[CONV3:%.*]] = bitcast i64* [[F_CASTED]] to i32*
+// CHECK1-NEXT: store i32 [[TMP11]], i32* [[CONV3]], align 4
+// CHECK1-NEXT: [[TMP12:%.*]] = load i64, i64* [[F_CASTED]], align 8
+// CHECK1-NEXT: [[TMP13:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0
+// CHECK1-NEXT: [[TMP14:%.*]] = inttoptr i64 [[TMP8]] to i8*
+// CHECK1-NEXT: store i8* [[TMP14]], i8** [[TMP13]], align 8
+// CHECK1-NEXT: [[TMP15:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1
+// CHECK1-NEXT: [[TMP16:%.*]] = inttoptr i64 [[TMP10]] to i8*
+// CHECK1-NEXT: store i8* [[TMP16]], i8** [[TMP15]], align 8
+// CHECK1-NEXT: [[TMP17:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 2
+// CHECK1-NEXT: [[TMP18:%.*]] = bitcast [10 x [10 x i32]]* [[TMP0]] to i8*
+// CHECK1-NEXT: store i8* [[TMP18]], i8** [[TMP17]], align 8
+// CHECK1-NEXT: [[TMP19:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 3
+// CHECK1-NEXT: [[TMP20:%.*]] = inttoptr i64 [[TMP12]] to i8*
+// CHECK1-NEXT: store i8* [[TMP20]], i8** [[TMP19]], align 8
+// CHECK1-NEXT: [[TMP21:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK1-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i64, i64, [10 x [10 x i32]]*, i64)* @__omp_outlined__7 to i8*), i8* null, i8** [[TMP21]], i64 4)
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK1: omp.inner.for.inc:
+// CHECK1-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP22]], [[TMP23]]
+// CHECK1-NEXT: store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK1-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK1-NEXT: [[ADD4:%.*]] = add nsw i32 [[TMP24]], [[TMP25]]
+// CHECK1-NEXT: store i32 [[ADD4]], i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK1-NEXT: [[TMP26:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: [[TMP27:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK1-NEXT: [[ADD5:%.*]] = add nsw i32 [[TMP26]], [[TMP27]]
+// CHECK1-NEXT: store i32 [[ADD5]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: [[TMP28:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[TMP28]], 99
+// CHECK1-NEXT: br i1 [[CMP6]], label [[COND_TRUE7:%.*]], label [[COND_FALSE8:%.*]]
+// CHECK1: cond.true7:
+// CHECK1-NEXT: br label [[COND_END9:%.*]]
+// CHECK1: cond.false8:
+// CHECK1-NEXT: [[TMP29:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: br label [[COND_END9]]
+// CHECK1: cond.end9:
+// CHECK1-NEXT: [[COND10:%.*]] = phi i32 [ 99, [[COND_TRUE7]] ], [ [[TMP29]], [[COND_FALSE8]] ]
+// CHECK1-NEXT: store i32 [[COND10]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: [[TMP30:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK1-NEXT: store i32 [[TMP30]], i32* [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP17:![0-9]+]]
+// CHECK1: omp.inner.for.end:
+// CHECK1-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK1: omp.loop.exit:
+// CHECK1-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]])
+// CHECK1-NEXT: [[TMP31:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK1-NEXT: [[TMP32:%.*]] = icmp ne i32 [[TMP31]], 0
+// CHECK1-NEXT: br i1 [[TMP32]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]]
+// CHECK1: .omp.final.then:
+// CHECK1-NEXT: store i32 10, i32* [[I]], align 4
+// CHECK1-NEXT: store i32 10, i32* [[J]], align 4
+// CHECK1-NEXT: br label [[DOTOMP_FINAL_DONE]]
+// CHECK1: .omp.final.done:
+// CHECK1-NEXT: ret void
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__7
+// CHECK1-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i64 [[F:%.*]]) #[[ATTR0]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK1-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 8
+// CHECK1-NEXT: [[F_ADDR:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[_TMP1:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[K:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[J:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
+// CHECK1-NEXT: store i64 [[DOTPREVIOUS_LB_]], i64* [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK1-NEXT: store i64 [[DOTPREVIOUS_UB_]], i64* [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK1-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 8
+// CHECK1-NEXT: store i64 [[F]], i64* [[F_ADDR]], align 8
+// CHECK1-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 8
+// CHECK1-NEXT: [[CONV:%.*]] = bitcast i64* [[F_ADDR]] to i32*
+// CHECK1-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK1-NEXT: store i32 99, i32* [[DOTOMP_UB]], align 4
+// CHECK1-NEXT: [[TMP1:%.*]] = load i64, i64* [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK1-NEXT: [[CONV2:%.*]] = trunc i64 [[TMP1]] to i32
+// CHECK1-NEXT: [[TMP2:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK1-NEXT: [[CONV3:%.*]] = trunc i64 [[TMP2]] to i32
+// CHECK1-NEXT: store i32 [[CONV2]], i32* [[DOTOMP_LB]], align 4
+// CHECK1-NEXT: store i32 [[CONV3]], i32* [[DOTOMP_UB]], align 4
+// CHECK1-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK1-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK1-NEXT: [[TMP3:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4
+// CHECK1-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP4]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK1-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK1-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK1: omp.inner.for.cond:
+// CHECK1-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: [[CONV4:%.*]] = sext i32 [[TMP6]] to i64
+// CHECK1-NEXT: [[TMP7:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK1-NEXT: [[CMP:%.*]] = icmp ule i64 [[CONV4]], [[TMP7]]
+// CHECK1-NEXT: br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK1: omp.inner.for.body:
+// CHECK1-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: [[DIV:%.*]] = sdiv i32 [[TMP8]], 10
+// CHECK1-NEXT: [[MUL:%.*]] = mul nsw i32 [[DIV]], 1
+// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK1-NEXT: store i32 [[ADD]], i32* [[I]], align 4
+// CHECK1-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: [[DIV5:%.*]] = sdiv i32 [[TMP10]], 10
+// CHECK1-NEXT: [[MUL6:%.*]] = mul nsw i32 [[DIV5]], 10
+// CHECK1-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP9]], [[MUL6]]
+// CHECK1-NEXT: [[MUL7:%.*]] = mul nsw i32 [[SUB]], 1
+// CHECK1-NEXT: [[ADD8:%.*]] = add nsw i32 0, [[MUL7]]
+// CHECK1-NEXT: store i32 [[ADD8]], i32* [[J]], align 4
+// CHECK1-NEXT: store i32 10, i32* [[K]], align 4
+// CHECK1-NEXT: [[TMP11:%.*]] = load i32, i32* [[I]], align 4
+// CHECK1-NEXT: [[TMP12:%.*]] = load i32, i32* [[J]], align 4
+// CHECK1-NEXT: [[TMP13:%.*]] = load i32, i32* [[CONV]], align 8
+// CHECK1-NEXT: [[MUL9:%.*]] = mul nsw i32 [[TMP12]], [[TMP13]]
+// CHECK1-NEXT: [[ADD10:%.*]] = add nsw i32 [[TMP11]], [[MUL9]]
+// CHECK1-NEXT: [[TMP14:%.*]] = load i32, i32* [[K]], align 4
+// CHECK1-NEXT: [[ADD11:%.*]] = add nsw i32 [[ADD10]], [[TMP14]]
+// CHECK1-NEXT: [[TMP15:%.*]] = load i32, i32* [[I]], align 4
+// CHECK1-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP15]] to i64
+// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* [[TMP0]], i64 0, i64 [[IDXPROM]]
+// CHECK1-NEXT: [[TMP16:%.*]] = load i32, i32* [[J]], align 4
+// CHECK1-NEXT: [[IDXPROM12:%.*]] = sext i32 [[TMP16]] to i64
+// CHECK1-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[ARRAYIDX]], i64 0, i64 [[IDXPROM12]]
+// CHECK1-NEXT: store i32 [[ADD11]], i32* [[ARRAYIDX13]], align 4
+// CHECK1-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK1: omp.body.continue:
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK1: omp.inner.for.inc:
+// CHECK1-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK1-NEXT: [[ADD14:%.*]] = add nsw i32 [[TMP17]], [[TMP18]]
+// CHECK1-NEXT: store i32 [[ADD14]], i32* [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP18:![0-9]+]]
+// CHECK1: omp.inner.for.end:
+// CHECK1-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK1: omp.loop.exit:
+// CHECK1-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP4]])
+// CHECK1-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK1-NEXT: [[TMP20:%.*]] = icmp ne i32 [[TMP19]], 0
+// CHECK1-NEXT: br i1 [[TMP20]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]]
+// CHECK1: .omp.final.then:
+// CHECK1-NEXT: store i32 10, i32* [[I]], align 4
+// CHECK1-NEXT: store i32 10, i32* [[J]], align 4
+// CHECK1-NEXT: br label [[DOTOMP_FINAL_DONE]]
+// CHECK1: .omp.final.done:
+// CHECK1-NEXT: ret void
+//
+//
+// CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l38
+// CHECK2-SAME: (i64 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i64 [[L:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK2-NEXT: entry:
+// CHECK2-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8
+// CHECK2-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 8
+// CHECK2-NEXT: [[L_ADDR:%.*]] = alloca i64, align 8
+// CHECK2-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8
+// CHECK2-NEXT: [[L_CASTED:%.*]] = alloca i64, align 8
+// CHECK2-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK2-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8
+// CHECK2-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 8
+// CHECK2-NEXT: store i64 [[L]], i64* [[L_ADDR]], align 8
+// CHECK2-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32*
+// CHECK2-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 8
+// CHECK2-NEXT: [[CONV1:%.*]] = bitcast i64* [[L_ADDR]] to i32*
+// CHECK2-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK2-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0)
+// CHECK2-NEXT: br label [[DOTEXECUTE:%.*]]
+// CHECK2: .execute:
+// CHECK2-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3:[0-9]+]])
+// CHECK2-NEXT: [[TMP2:%.*]] = load i32, i32* [[CONV]], align 8
+// CHECK2-NEXT: [[CONV2:%.*]] = bitcast i64* [[N_CASTED]] to i32*
+// CHECK2-NEXT: store i32 [[TMP2]], i32* [[CONV2]], align 4
+// CHECK2-NEXT: [[TMP3:%.*]] = load i64, i64* [[N_CASTED]], align 8
+// CHECK2-NEXT: [[TMP4:%.*]] = load i32, i32* [[CONV1]], align 8
+// CHECK2-NEXT: [[CONV3:%.*]] = bitcast i64* [[L_CASTED]] to i32*
+// CHECK2-NEXT: store i32 [[TMP4]], i32* [[CONV3]], align 4
+// CHECK2-NEXT: [[TMP5:%.*]] = load i64, i64* [[L_CASTED]], align 8
+// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK2-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i64 [[TMP3]], [1000 x i32]* [[TMP0]], i64 [[TMP5]]) #[[ATTR2:[0-9]+]]
+// CHECK2-NEXT: br label [[DOTOMP_DEINIT:%.*]]
+// CHECK2: .omp.deinit:
+// CHECK2-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0)
+// CHECK2-NEXT: br label [[DOTEXIT:%.*]]
+// CHECK2: .exit:
+// CHECK2-NEXT: ret void
+//
+//
+// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__
+// CHECK2-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i64 [[L:%.*]]) #[[ATTR0]] {
+// CHECK2-NEXT: entry:
+// CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK2-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8
+// CHECK2-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 8
+// CHECK2-NEXT: [[L_ADDR:%.*]] = alloca i64, align 8
+// CHECK2-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTCAPTURE_EXPR_3:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[I5:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8
+// CHECK2-NEXT: [[L_CASTED:%.*]] = alloca i64, align 8
+// CHECK2-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [5 x i8*], align 8
+// CHECK2-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK2-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
+// CHECK2-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8
+// CHECK2-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 8
+// CHECK2-NEXT: store i64 [[L]], i64* [[L_ADDR]], align 8
+// CHECK2-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32*
+// CHECK2-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 8
+// CHECK2-NEXT: [[CONV1:%.*]] = bitcast i64* [[L_ADDR]] to i32*
+// CHECK2-NEXT: [[TMP1:%.*]] = call i8* @__kmpc_data_sharing_push_stack(i64 4, i16 1)
+// CHECK2-NEXT: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to %struct._globalized_locals_ty*
+// CHECK2-NEXT: [[L2:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP2]], i32 0, i32 0
+// CHECK2-NEXT: [[TMP3:%.*]] = load i32, i32* [[CONV]], align 8
+// CHECK2-NEXT: store i32 [[TMP3]], i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK2-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK2-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP4]], 0
+// CHECK2-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK2-NEXT: [[SUB4:%.*]] = sub nsw i32 [[DIV]], 1
+// CHECK2-NEXT: store i32 [[SUB4]], i32* [[DOTCAPTURE_EXPR_3]], align 4
+// CHECK2-NEXT: store i32 0, i32* [[I]], align 4
+// CHECK2-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK2-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP5]]
+// CHECK2-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]]
+// CHECK2: omp.precond.then:
+// CHECK2-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK2-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4
+// CHECK2-NEXT: store i32 [[TMP6]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK2-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK2-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK2-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK2-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4
+// CHECK2-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1:[0-9]+]], i32 [[TMP8]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 128)
+// CHECK2-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK2-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4
+// CHECK2-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[TMP9]], [[TMP10]]
+// CHECK2-NEXT: br i1 [[CMP6]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK2: cond.true:
+// CHECK2-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4
+// CHECK2-NEXT: br label [[COND_END:%.*]]
+// CHECK2: cond.false:
+// CHECK2-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK2-NEXT: br label [[COND_END]]
+// CHECK2: cond.end:
+// CHECK2-NEXT: [[COND:%.*]] = phi i32 [ [[TMP11]], [[COND_TRUE]] ], [ [[TMP12]], [[COND_FALSE]] ]
+// CHECK2-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK2-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK2-NEXT: store i32 [[TMP13]], i32* [[DOTOMP_IV]], align 4
+// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK2: omp.inner.for.cond:
+// CHECK2-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK2-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4
+// CHECK2-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP15]], 1
+// CHECK2-NEXT: [[CMP7:%.*]] = icmp slt i32 [[TMP14]], [[ADD]]
+// CHECK2-NEXT: br i1 [[CMP7]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK2: omp.inner.for.body:
+// CHECK2-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK2-NEXT: [[TMP17:%.*]] = zext i32 [[TMP16]] to i64
+// CHECK2-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK2-NEXT: [[TMP19:%.*]] = zext i32 [[TMP18]] to i64
+// CHECK2-NEXT: [[TMP20:%.*]] = load i32, i32* [[CONV]], align 8
+// CHECK2-NEXT: [[CONV8:%.*]] = bitcast i64* [[N_CASTED]] to i32*
+// CHECK2-NEXT: store i32 [[TMP20]], i32* [[CONV8]], align 4
+// CHECK2-NEXT: [[TMP21:%.*]] = load i64, i64* [[N_CASTED]], align 8
+// CHECK2-NEXT: [[TMP22:%.*]] = load i32, i32* [[CONV1]], align 8
+// CHECK2-NEXT: [[CONV9:%.*]] = bitcast i64* [[L_CASTED]] to i32*
+// CHECK2-NEXT: store i32 [[TMP22]], i32* [[CONV9]], align 4
+// CHECK2-NEXT: [[TMP23:%.*]] = load i64, i64* [[L_CASTED]], align 8
+// CHECK2-NEXT: [[TMP24:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0
+// CHECK2-NEXT: [[TMP25:%.*]] = inttoptr i64 [[TMP17]] to i8*
+// CHECK2-NEXT: store i8* [[TMP25]], i8** [[TMP24]], align 8
+// CHECK2-NEXT: [[TMP26:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1
+// CHECK2-NEXT: [[TMP27:%.*]] = inttoptr i64 [[TMP19]] to i8*
+// CHECK2-NEXT: store i8* [[TMP27]], i8** [[TMP26]], align 8
+// CHECK2-NEXT: [[TMP28:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 2
+// CHECK2-NEXT: [[TMP29:%.*]] = inttoptr i64 [[TMP21]] to i8*
+// CHECK2-NEXT: store i8* [[TMP29]], i8** [[TMP28]], align 8
+// CHECK2-NEXT: [[TMP30:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 3
+// CHECK2-NEXT: [[TMP31:%.*]] = bitcast [1000 x i32]* [[TMP0]] to i8*
+// CHECK2-NEXT: store i8* [[TMP31]], i8** [[TMP30]], align 8
+// CHECK2-NEXT: [[TMP32:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 4
+// CHECK2-NEXT: [[TMP33:%.*]] = inttoptr i64 [[TMP23]] to i8*
+// CHECK2-NEXT: store i8* [[TMP33]], i8** [[TMP32]], align 8
+// CHECK2-NEXT: [[TMP34:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK2-NEXT: [[TMP35:%.*]] = load i32, i32* [[TMP34]], align 4
+// CHECK2-NEXT: [[TMP36:%.*]] = bitcast [5 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK2-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP35]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i64, i64, i64, [1000 x i32]*, i64)* @__omp_outlined__1 to i8*), i8* null, i8** [[TMP36]], i64 5)
+// CHECK2-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK2: omp.inner.for.inc:
+// CHECK2-NEXT: [[TMP37:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK2-NEXT: [[TMP38:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK2-NEXT: [[ADD10:%.*]] = add nsw i32 [[TMP37]], [[TMP38]]
+// CHECK2-NEXT: store i32 [[ADD10]], i32* [[DOTOMP_IV]], align 4
+// CHECK2-NEXT: [[TMP39:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK2-NEXT: [[TMP40:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK2-NEXT: [[ADD11:%.*]] = add nsw i32 [[TMP39]], [[TMP40]]
+// CHECK2-NEXT: store i32 [[ADD11]], i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK2-NEXT: [[TMP41:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK2-NEXT: [[TMP42:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK2-NEXT: [[ADD12:%.*]] = add nsw i32 [[TMP41]], [[TMP42]]
+// CHECK2-NEXT: store i32 [[ADD12]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK2-NEXT: [[TMP43:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK2-NEXT: [[TMP44:%.*]] = load i32, i32*
[[DOTCAPTURE_EXPR_3]], align 4 +// CHECK2-NEXT: [[CMP13:%.*]] = icmp sgt i32 [[TMP43]], [[TMP44]] +// CHECK2-NEXT: br i1 [[CMP13]], label [[COND_TRUE14:%.*]], label [[COND_FALSE15:%.*]] +// CHECK2: cond.true14: +// CHECK2-NEXT: [[TMP45:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4 +// CHECK2-NEXT: br label [[COND_END16:%.*]] +// CHECK2: cond.false15: +// CHECK2-NEXT: [[TMP46:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: br label [[COND_END16]] +// CHECK2: cond.end16: +// CHECK2-NEXT: [[COND17:%.*]] = phi i32 [ [[TMP45]], [[COND_TRUE14]] ], [ [[TMP46]], [[COND_FALSE15]] ] +// CHECK2-NEXT: store i32 [[COND17]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: [[TMP47:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK2-NEXT: store i32 [[TMP47]], i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP10:![0-9]+]] +// CHECK2: omp.inner.for.end: +// CHECK2-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK2: omp.loop.exit: +// CHECK2-NEXT: [[TMP48:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK2-NEXT: [[TMP49:%.*]] = load i32, i32* [[TMP48]], align 4 +// CHECK2-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP49]]) +// CHECK2-NEXT: [[TMP50:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK2-NEXT: [[TMP51:%.*]] = icmp ne i32 [[TMP50]], 0 +// CHECK2-NEXT: br i1 [[TMP51]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] +// CHECK2: .omp.final.then: +// CHECK2-NEXT: [[TMP52:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK2-NEXT: [[SUB18:%.*]] = sub nsw i32 [[TMP52]], 0 +// CHECK2-NEXT: [[DIV19:%.*]] = sdiv i32 [[SUB18]], 1 +// CHECK2-NEXT: [[MUL:%.*]] = mul nsw i32 [[DIV19]], 1 +// CHECK2-NEXT: [[ADD20:%.*]] = add nsw i32 0, [[MUL]] +// CHECK2-NEXT: store i32 [[ADD20]], i32* [[I5]], align 4 +// CHECK2-NEXT: br label [[DOTOMP_FINAL_DONE]] +// CHECK2: .omp.final.done: +// CHECK2-NEXT: [[TMP53:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK2-NEXT: [[TMP54:%.*]] = icmp ne i32 [[TMP53]], 0 +// CHECK2-NEXT: br i1 [[TMP54]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]] +// CHECK2: .omp.lastprivate.then: +// CHECK2-NEXT: [[TMP55:%.*]] = load i32, i32* [[CONV1]], align 8 +// CHECK2-NEXT: store i32 [[TMP55]], i32* [[CONV1]], align 8 +// CHECK2-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]] +// CHECK2: .omp.lastprivate.done: +// CHECK2-NEXT: br label [[OMP_PRECOND_END]] +// CHECK2: omp.precond.end: +// CHECK2-NEXT: call void @__kmpc_data_sharing_pop_stack(i8* [[TMP1]]) +// CHECK2-NEXT: ret void +// +// +// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__1 +// CHECK2-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], i64 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i64 [[L:%.*]]) #[[ATTR0]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK2-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8 +// CHECK2-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8 +// CHECK2-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 +// CHECK2-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 8 +// CHECK2-NEXT: [[L_ADDR:%.*]] = alloca i64, align 8 +// CHECK2-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: 
[[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[I6:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK2-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +// CHECK2-NEXT: store i64 [[DOTPREVIOUS_LB_]], i64* [[DOTPREVIOUS_LB__ADDR]], align 8 +// CHECK2-NEXT: store i64 [[DOTPREVIOUS_UB_]], i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK2-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 +// CHECK2-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 8 +// CHECK2-NEXT: store i64 [[L]], i64* [[L_ADDR]], align 8 +// CHECK2-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* +// CHECK2-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 8 +// CHECK2-NEXT: [[CONV1:%.*]] = bitcast i64* [[L_ADDR]] to i32* +// CHECK2-NEXT: [[TMP1:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK2-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK2-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 +// CHECK2-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK2-NEXT: [[SUB3:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK2-NEXT: store i32 [[SUB3]], i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK2-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK2-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK2-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] +// CHECK2-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK2: omp.precond.then: +// CHECK2-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 +// CHECK2-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK2-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_UB]], align 4 +// CHECK2-NEXT: [[TMP5:%.*]] = load i64, i64* [[DOTPREVIOUS_LB__ADDR]], align 8 +// CHECK2-NEXT: [[CONV4:%.*]] = trunc i64 [[TMP5]] to i32 +// CHECK2-NEXT: [[TMP6:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK2-NEXT: [[CONV5:%.*]] = trunc i64 [[TMP6]] to i32 +// CHECK2-NEXT: store i32 [[CONV4]], i32* [[DOTOMP_LB]], align 4 +// CHECK2-NEXT: store i32 [[CONV5]], i32* [[DOTOMP_UB]], align 4 +// CHECK2-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK2-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK2-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK2-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4 +// CHECK2-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP8]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 32) +// CHECK2-NEXT: br label [[OMP_DISPATCH_COND:%.*]] +// CHECK2: omp.dispatch.cond: +// CHECK2-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK2-NEXT: [[CONV7:%.*]] = sext i32 [[TMP9]] to i64 +// CHECK2-NEXT: [[TMP10:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK2-NEXT: [[CMP8:%.*]] = icmp ugt i64 [[CONV7]], [[TMP10]] +// CHECK2-NEXT: br i1 [[CMP8]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK2: cond.true: +// CHECK2-NEXT: 
[[TMP11:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK2-NEXT: br label [[COND_END:%.*]] +// CHECK2: cond.false: +// CHECK2-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK2-NEXT: [[CONV9:%.*]] = sext i32 [[TMP12]] to i64 +// CHECK2-NEXT: br label [[COND_END]] +// CHECK2: cond.end: +// CHECK2-NEXT: [[COND:%.*]] = phi i64 [ [[TMP11]], [[COND_TRUE]] ], [ [[CONV9]], [[COND_FALSE]] ] +// CHECK2-NEXT: [[CONV10:%.*]] = trunc i64 [[COND]] to i32 +// CHECK2-NEXT: store i32 [[CONV10]], i32* [[DOTOMP_UB]], align 4 +// CHECK2-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK2-NEXT: store i32 [[TMP13]], i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK2-NEXT: [[CMP11:%.*]] = icmp sle i32 [[TMP14]], [[TMP15]] +// CHECK2-NEXT: br i1 [[CMP11]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]] +// CHECK2: omp.dispatch.body: +// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK2: omp.inner.for.cond: +// CHECK2-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK2-NEXT: [[CMP12:%.*]] = icmp sle i32 [[TMP16]], [[TMP17]] +// CHECK2-NEXT: br i1 [[CMP12]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK2: omp.inner.for.body: +// CHECK2-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP18]], 1 +// CHECK2-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK2-NEXT: store i32 [[ADD]], i32* [[I6]], align 4 +// CHECK2-NEXT: [[TMP19:%.*]] = load i32, i32* [[I6]], align 4 +// CHECK2-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP19]] to i64 +// CHECK2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1000 x i32], [1000 x i32]* [[TMP0]], i64 0, i64 [[IDXPROM]] +// CHECK2-NEXT: store i32 1, i32* [[ARRAYIDX]], align 4 +// CHECK2-NEXT: [[TMP20:%.*]] = load i32, i32* [[I6]], align 4 +// CHECK2-NEXT: store i32 [[TMP20]], i32* [[CONV1]], align 8 +// CHECK2-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK2: omp.body.continue: +// CHECK2-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK2: omp.inner.for.inc: +// CHECK2-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: [[ADD13:%.*]] = add nsw i32 [[TMP21]], 1 +// CHECK2-NEXT: store i32 [[ADD13]], i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP12:![0-9]+]] +// CHECK2: omp.inner.for.end: +// CHECK2-NEXT: br label [[OMP_DISPATCH_INC:%.*]] +// CHECK2: omp.dispatch.inc: +// CHECK2-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK2-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK2-NEXT: [[ADD14:%.*]] = add nsw i32 [[TMP22]], [[TMP23]] +// CHECK2-NEXT: store i32 [[ADD14]], i32* [[DOTOMP_LB]], align 4 +// CHECK2-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK2-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK2-NEXT: [[ADD15:%.*]] = add nsw i32 [[TMP24]], [[TMP25]] +// CHECK2-NEXT: store i32 [[ADD15]], i32* [[DOTOMP_UB]], align 4 +// CHECK2-NEXT: br label [[OMP_DISPATCH_COND]] +// CHECK2: omp.dispatch.end: +// CHECK2-NEXT: [[TMP26:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK2-NEXT: [[TMP27:%.*]] = load i32, i32* [[TMP26]], align 4 +// CHECK2-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP27]]) +// 
CHECK2-NEXT: [[TMP28:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK2-NEXT: [[TMP29:%.*]] = icmp ne i32 [[TMP28]], 0 +// CHECK2-NEXT: br i1 [[TMP29]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] +// CHECK2: .omp.final.then: +// CHECK2-NEXT: [[TMP30:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK2-NEXT: [[SUB16:%.*]] = sub nsw i32 [[TMP30]], 0 +// CHECK2-NEXT: [[DIV17:%.*]] = sdiv i32 [[SUB16]], 1 +// CHECK2-NEXT: [[MUL18:%.*]] = mul nsw i32 [[DIV17]], 1 +// CHECK2-NEXT: [[ADD19:%.*]] = add nsw i32 0, [[MUL18]] +// CHECK2-NEXT: store i32 [[ADD19]], i32* [[I6]], align 4 +// CHECK2-NEXT: br label [[DOTOMP_FINAL_DONE]] +// CHECK2: .omp.final.done: +// CHECK2-NEXT: [[TMP31:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK2-NEXT: [[TMP32:%.*]] = icmp ne i32 [[TMP31]], 0 +// CHECK2-NEXT: br i1 [[TMP32]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]] +// CHECK2: .omp.lastprivate.then: +// CHECK2-NEXT: [[TMP33:%.*]] = load i32, i32* [[CONV1]], align 8 +// CHECK2-NEXT: store i32 [[TMP33]], i32* [[CONV1]], align 8 +// CHECK2-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]] +// CHECK2: .omp.lastprivate.done: +// CHECK2-NEXT: br label [[OMP_PRECOND_END]] +// CHECK2: omp.precond.end: +// CHECK2-NEXT: ret void +// +// +// CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l44 +// CHECK2-SAME: (i64 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 +// CHECK2-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 8 +// CHECK2-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8 +// CHECK2-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK2-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 +// CHECK2-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 8 +// CHECK2-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* +// CHECK2-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 8 +// CHECK2-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK2-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) +// CHECK2-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK2: .execute: +// CHECK2-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) +// CHECK2-NEXT: [[TMP2:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK2-NEXT: [[CONV1:%.*]] = bitcast i64* [[N_CASTED]] to i32* +// CHECK2-NEXT: store i32 [[TMP2]], i32* [[CONV1]], align 4 +// CHECK2-NEXT: [[TMP3:%.*]] = load i64, i64* [[N_CASTED]], align 8 +// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK2-NEXT: call void @__omp_outlined__2(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i64 [[TMP3]], [1000 x i16]* [[TMP0]]) #[[ATTR2]] +// CHECK2-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK2: .omp.deinit: +// CHECK2-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) +// CHECK2-NEXT: br label [[DOTEXIT:%.*]] +// CHECK2: .exit: +// CHECK2-NEXT: ret void +// +// +// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__2 +// CHECK2-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = 
alloca i32*, align 8 +// CHECK2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK2-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 +// CHECK2-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 8 +// CHECK2-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[I3:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8 +// CHECK2-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x i8*], align 8 +// CHECK2-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK2-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +// CHECK2-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 +// CHECK2-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 8 +// CHECK2-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* +// CHECK2-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 8 +// CHECK2-NEXT: [[TMP1:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK2-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK2-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 +// CHECK2-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK2-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK2-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK2-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK2-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK2-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] +// CHECK2-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK2: omp.precond.then: +// CHECK2-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK2-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK2-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK2-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK2-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK2-NEXT: [[TMP5:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK2-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4 +// CHECK2-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP6]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) +// CHECK2-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK2-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP7]], [[TMP8]] +// CHECK2-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK2: cond.true: +// CHECK2-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK2-NEXT: br label [[COND_END:%.*]] +// CHECK2: cond.false: +// CHECK2-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: br label [[COND_END]] +// CHECK2: cond.end: +// 
CHECK2-NEXT: [[COND:%.*]] = phi i32 [ [[TMP9]], [[COND_TRUE]] ], [ [[TMP10]], [[COND_FALSE]] ] +// CHECK2-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK2-NEXT: store i32 [[TMP11]], i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK2: omp.inner.for.cond: +// CHECK2-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK2-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP13]], 1 +// CHECK2-NEXT: [[CMP5:%.*]] = icmp slt i32 [[TMP12]], [[ADD]] +// CHECK2-NEXT: br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK2: omp.inner.for.body: +// CHECK2-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK2-NEXT: [[TMP15:%.*]] = zext i32 [[TMP14]] to i64 +// CHECK2-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: [[TMP17:%.*]] = zext i32 [[TMP16]] to i64 +// CHECK2-NEXT: [[TMP18:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK2-NEXT: [[CONV6:%.*]] = bitcast i64* [[N_CASTED]] to i32* +// CHECK2-NEXT: store i32 [[TMP18]], i32* [[CONV6]], align 4 +// CHECK2-NEXT: [[TMP19:%.*]] = load i64, i64* [[N_CASTED]], align 8 +// CHECK2-NEXT: [[TMP20:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0 +// CHECK2-NEXT: [[TMP21:%.*]] = inttoptr i64 [[TMP15]] to i8* +// CHECK2-NEXT: store i8* [[TMP21]], i8** [[TMP20]], align 8 +// CHECK2-NEXT: [[TMP22:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1 +// CHECK2-NEXT: [[TMP23:%.*]] = inttoptr i64 [[TMP17]] to i8* +// CHECK2-NEXT: store i8* [[TMP23]], i8** [[TMP22]], align 8 +// CHECK2-NEXT: [[TMP24:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 2 +// CHECK2-NEXT: [[TMP25:%.*]] = inttoptr i64 [[TMP19]] to i8* +// CHECK2-NEXT: store i8* [[TMP25]], i8** [[TMP24]], align 8 +// CHECK2-NEXT: [[TMP26:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 3 +// CHECK2-NEXT: [[TMP27:%.*]] = bitcast [1000 x i16]* [[TMP0]] to i8* +// CHECK2-NEXT: store i8* [[TMP27]], i8** [[TMP26]], align 8 +// CHECK2-NEXT: [[TMP28:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK2-NEXT: [[TMP29:%.*]] = load i32, i32* [[TMP28]], align 4 +// CHECK2-NEXT: [[TMP30:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK2-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP29]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i64, i64, i64, [1000 x i16]*)* @__omp_outlined__3 to i8*), i8* null, i8** [[TMP30]], i64 4) +// CHECK2-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK2: omp.inner.for.inc: +// CHECK2-NEXT: [[TMP31:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: [[TMP32:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK2-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP31]], [[TMP32]] +// CHECK2-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: [[TMP33:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK2-NEXT: [[TMP34:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK2-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP33]], [[TMP34]] +// CHECK2-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK2-NEXT: [[TMP35:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: [[TMP36:%.*]] = load i32, i32* 
[[DOTOMP_STRIDE]], align 4 +// CHECK2-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP35]], [[TMP36]] +// CHECK2-NEXT: store i32 [[ADD9]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: [[TMP37:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: [[TMP38:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK2-NEXT: [[CMP10:%.*]] = icmp sgt i32 [[TMP37]], [[TMP38]] +// CHECK2-NEXT: br i1 [[CMP10]], label [[COND_TRUE11:%.*]], label [[COND_FALSE12:%.*]] +// CHECK2: cond.true11: +// CHECK2-NEXT: [[TMP39:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK2-NEXT: br label [[COND_END13:%.*]] +// CHECK2: cond.false12: +// CHECK2-NEXT: [[TMP40:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: br label [[COND_END13]] +// CHECK2: cond.end13: +// CHECK2-NEXT: [[COND14:%.*]] = phi i32 [ [[TMP39]], [[COND_TRUE11]] ], [ [[TMP40]], [[COND_FALSE12]] ] +// CHECK2-NEXT: store i32 [[COND14]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: [[TMP41:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK2-NEXT: store i32 [[TMP41]], i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP13:![0-9]+]] +// CHECK2: omp.inner.for.end: +// CHECK2-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK2: omp.loop.exit: +// CHECK2-NEXT: [[TMP42:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK2-NEXT: [[TMP43:%.*]] = load i32, i32* [[TMP42]], align 4 +// CHECK2-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP43]]) +// CHECK2-NEXT: [[TMP44:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK2-NEXT: [[TMP45:%.*]] = icmp ne i32 [[TMP44]], 0 +// CHECK2-NEXT: br i1 [[TMP45]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] +// CHECK2: .omp.final.then: +// CHECK2-NEXT: [[TMP46:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK2-NEXT: [[SUB15:%.*]] = sub nsw i32 [[TMP46]], 0 +// CHECK2-NEXT: [[DIV16:%.*]] = sdiv i32 [[SUB15]], 1 +// CHECK2-NEXT: [[MUL:%.*]] = mul nsw i32 [[DIV16]], 1 +// CHECK2-NEXT: [[ADD17:%.*]] = add nsw i32 0, [[MUL]] +// CHECK2-NEXT: store i32 [[ADD17]], i32* [[I3]], align 4 +// CHECK2-NEXT: br label [[DOTOMP_FINAL_DONE]] +// CHECK2: .omp.final.done: +// CHECK2-NEXT: br label [[OMP_PRECOND_END]] +// CHECK2: omp.precond.end: +// CHECK2-NEXT: ret void +// +// +// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__3 +// CHECK2-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], i64 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK2-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8 +// CHECK2-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8 +// CHECK2-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 +// CHECK2-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 8 +// CHECK2-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: 
[[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[I5:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK2-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +// CHECK2-NEXT: store i64 [[DOTPREVIOUS_LB_]], i64* [[DOTPREVIOUS_LB__ADDR]], align 8 +// CHECK2-NEXT: store i64 [[DOTPREVIOUS_UB_]], i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK2-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 +// CHECK2-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 8 +// CHECK2-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* +// CHECK2-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 8 +// CHECK2-NEXT: [[TMP1:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK2-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK2-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 +// CHECK2-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK2-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK2-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK2-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK2-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK2-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] +// CHECK2-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK2: omp.precond.then: +// CHECK2-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 +// CHECK2-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK2-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_UB]], align 4 +// CHECK2-NEXT: [[TMP5:%.*]] = load i64, i64* [[DOTPREVIOUS_LB__ADDR]], align 8 +// CHECK2-NEXT: [[CONV3:%.*]] = trunc i64 [[TMP5]] to i32 +// CHECK2-NEXT: [[TMP6:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK2-NEXT: [[CONV4:%.*]] = trunc i64 [[TMP6]] to i32 +// CHECK2-NEXT: store i32 [[CONV3]], i32* [[DOTOMP_LB]], align 4 +// CHECK2-NEXT: store i32 [[CONV4]], i32* [[DOTOMP_UB]], align 4 +// CHECK2-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK2-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK2-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK2-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4 +// CHECK2-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP8]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) +// CHECK2-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK2-NEXT: store i32 [[TMP9]], i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK2: omp.inner.for.cond: +// CHECK2-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: [[CONV6:%.*]] = sext i32 [[TMP10]] to i64 +// CHECK2-NEXT: [[TMP11:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK2-NEXT: [[CMP7:%.*]] = icmp ule i64 [[CONV6]], [[TMP11]] +// CHECK2-NEXT: br i1 [[CMP7]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK2: omp.inner.for.body: +// CHECK2-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP12]], 1 +// CHECK2-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK2-NEXT: store i32 [[ADD]], i32* [[I5]], align 4 +// CHECK2-NEXT: [[TMP13:%.*]] = load i32, i32* [[I5]], align 4 +// CHECK2-NEXT: [[IDXPROM:%.*]] = sext i32 
[[TMP13]] to i64 +// CHECK2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1000 x i16], [1000 x i16]* [[TMP0]], i64 0, i64 [[IDXPROM]] +// CHECK2-NEXT: [[TMP14:%.*]] = load i16, i16* [[ARRAYIDX]], align 2 +// CHECK2-NEXT: [[CONV8:%.*]] = sext i16 [[TMP14]] to i32 +// CHECK2-NEXT: [[ADD9:%.*]] = add nsw i32 [[CONV8]], 1 +// CHECK2-NEXT: [[CONV10:%.*]] = trunc i32 [[ADD9]] to i16 +// CHECK2-NEXT: store i16 [[CONV10]], i16* [[ARRAYIDX]], align 2 +// CHECK2-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK2: omp.body.continue: +// CHECK2-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK2: omp.inner.for.inc: +// CHECK2-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK2-NEXT: [[ADD11:%.*]] = add nsw i32 [[TMP15]], [[TMP16]] +// CHECK2-NEXT: store i32 [[ADD11]], i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP14:![0-9]+]] +// CHECK2: omp.inner.for.end: +// CHECK2-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK2: omp.loop.exit: +// CHECK2-NEXT: [[TMP17:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK2-NEXT: [[TMP18:%.*]] = load i32, i32* [[TMP17]], align 4 +// CHECK2-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP18]]) +// CHECK2-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK2-NEXT: [[TMP20:%.*]] = icmp ne i32 [[TMP19]], 0 +// CHECK2-NEXT: br i1 [[TMP20]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] +// CHECK2: .omp.final.then: +// CHECK2-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK2-NEXT: [[SUB12:%.*]] = sub nsw i32 [[TMP21]], 0 +// CHECK2-NEXT: [[DIV13:%.*]] = sdiv i32 [[SUB12]], 1 +// CHECK2-NEXT: [[MUL14:%.*]] = mul nsw i32 [[DIV13]], 1 +// CHECK2-NEXT: [[ADD15:%.*]] = add nsw i32 0, [[MUL14]] +// CHECK2-NEXT: store i32 [[ADD15]], i32* [[I5]], align 4 +// CHECK2-NEXT: br label [[DOTOMP_FINAL_DONE]] +// CHECK2: .omp.final.done: +// CHECK2-NEXT: br label [[OMP_PRECOND_END]] +// CHECK2: omp.precond.end: +// CHECK2-NEXT: ret void +// +// +// CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l49 +// CHECK2-SAME: ([10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 8 +// CHECK2-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK2-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 8 +// CHECK2-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 8 +// CHECK2-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK2-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) +// CHECK2-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK2: .execute: +// CHECK2-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) +// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK2-NEXT: call void @__omp_outlined__4(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x i32]* [[TMP0]]) #[[ATTR2]] +// CHECK2-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK2: .omp.deinit: +// CHECK2-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) +// CHECK2-NEXT: br label [[DOTEXIT:%.*]] +// CHECK2: .exit: +// CHECK2-NEXT: ret void +// +// +// CHECK2-LABEL: define 
{{[^@]+}}@__omp_outlined__4 +// CHECK2-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK2-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 8 +// CHECK2-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [3 x i8*], align 8 +// CHECK2-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK2-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +// CHECK2-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 8 +// CHECK2-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 8 +// CHECK2-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK2-NEXT: store i32 9, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK2-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK2-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK2-NEXT: [[TMP1:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK2-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4 +// CHECK2-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) +// CHECK2-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP3]], 9 +// CHECK2-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK2: cond.true: +// CHECK2-NEXT: br label [[COND_END:%.*]] +// CHECK2: cond.false: +// CHECK2-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: br label [[COND_END]] +// CHECK2: cond.end: +// CHECK2-NEXT: [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP4]], [[COND_FALSE]] ] +// CHECK2-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK2-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK2: omp.inner.for.cond: +// CHECK2-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP6]], 10 +// CHECK2-NEXT: br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK2: omp.inner.for.body: +// CHECK2-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK2-NEXT: [[TMP8:%.*]] = zext i32 [[TMP7]] to i64 +// CHECK2-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: [[TMP10:%.*]] = zext i32 [[TMP9]] to i64 +// CHECK2-NEXT: [[TMP11:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0 +// CHECK2-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP8]] to i8* +// CHECK2-NEXT: store i8* [[TMP12]], i8** [[TMP11]], align 8 +// CHECK2-NEXT: 
[[TMP13:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1 +// CHECK2-NEXT: [[TMP14:%.*]] = inttoptr i64 [[TMP10]] to i8* +// CHECK2-NEXT: store i8* [[TMP14]], i8** [[TMP13]], align 8 +// CHECK2-NEXT: [[TMP15:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 2 +// CHECK2-NEXT: [[TMP16:%.*]] = bitcast [10 x i32]* [[TMP0]] to i8* +// CHECK2-NEXT: store i8* [[TMP16]], i8** [[TMP15]], align 8 +// CHECK2-NEXT: [[TMP17:%.*]] = bitcast [3 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK2-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i64, i64, [10 x i32]*)* @__omp_outlined__5 to i8*), i8* null, i8** [[TMP17]], i64 3) +// CHECK2-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK2: omp.inner.for.inc: +// CHECK2-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK2-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP18]], [[TMP19]] +// CHECK2-NEXT: store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK2-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK2-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP20]], [[TMP21]] +// CHECK2-NEXT: store i32 [[ADD2]], i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK2-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK2-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP22]], [[TMP23]] +// CHECK2-NEXT: store i32 [[ADD3]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP24]], 9 +// CHECK2-NEXT: br i1 [[CMP4]], label [[COND_TRUE5:%.*]], label [[COND_FALSE6:%.*]] +// CHECK2: cond.true5: +// CHECK2-NEXT: br label [[COND_END7:%.*]] +// CHECK2: cond.false6: +// CHECK2-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: br label [[COND_END7]] +// CHECK2: cond.end7: +// CHECK2-NEXT: [[COND8:%.*]] = phi i32 [ 9, [[COND_TRUE5]] ], [ [[TMP25]], [[COND_FALSE6]] ] +// CHECK2-NEXT: store i32 [[COND8]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: [[TMP26:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK2-NEXT: store i32 [[TMP26]], i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP15:![0-9]+]] +// CHECK2: omp.inner.for.end: +// CHECK2-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK2: omp.loop.exit: +// CHECK2-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]]) +// CHECK2-NEXT: [[TMP27:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK2-NEXT: [[TMP28:%.*]] = icmp ne i32 [[TMP27]], 0 +// CHECK2-NEXT: br i1 [[TMP28]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] +// CHECK2: .omp.final.then: +// CHECK2-NEXT: store i32 10, i32* [[I]], align 4 +// CHECK2-NEXT: br label [[DOTOMP_FINAL_DONE]] +// CHECK2: .omp.final.done: +// CHECK2-NEXT: ret void +// +// +// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__5 +// CHECK2-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +// 
CHECK2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK2-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8 +// CHECK2-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8 +// CHECK2-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 8 +// CHECK2-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK2-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +// CHECK2-NEXT: store i64 [[DOTPREVIOUS_LB_]], i64* [[DOTPREVIOUS_LB__ADDR]], align 8 +// CHECK2-NEXT: store i64 [[DOTPREVIOUS_UB_]], i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK2-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 8 +// CHECK2-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 8 +// CHECK2-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 +// CHECK2-NEXT: store i32 9, i32* [[DOTOMP_UB]], align 4 +// CHECK2-NEXT: [[TMP1:%.*]] = load i64, i64* [[DOTPREVIOUS_LB__ADDR]], align 8 +// CHECK2-NEXT: [[CONV:%.*]] = trunc i64 [[TMP1]] to i32 +// CHECK2-NEXT: [[TMP2:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK2-NEXT: [[CONV1:%.*]] = trunc i64 [[TMP2]] to i32 +// CHECK2-NEXT: store i32 [[CONV]], i32* [[DOTOMP_LB]], align 4 +// CHECK2-NEXT: store i32 [[CONV1]], i32* [[DOTOMP_UB]], align 4 +// CHECK2-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK2-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK2-NEXT: [[TMP3:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK2-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4 +// CHECK2-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP4]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) +// CHECK2-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK2-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK2: omp.inner.for.cond: +// CHECK2-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: [[CONV2:%.*]] = sext i32 [[TMP6]] to i64 +// CHECK2-NEXT: [[TMP7:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK2-NEXT: [[CMP:%.*]] = icmp ule i64 [[CONV2]], [[TMP7]] +// CHECK2-NEXT: br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK2: omp.inner.for.body: +// CHECK2-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP8]], 1 +// CHECK2-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK2-NEXT: store i32 [[ADD]], i32* [[I]], align 4 +// CHECK2-NEXT: [[TMP9:%.*]] = load i32, i32* [[I]], align 4 +// CHECK2-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP9]] to i64 +// CHECK2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP0]], i64 0, i64 [[IDXPROM]] +// CHECK2-NEXT: [[TMP10:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +// CHECK2-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP10]], 1 +// CHECK2-NEXT: store i32 [[ADD3]], i32* [[ARRAYIDX]], align 4 +// CHECK2-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK2: omp.body.continue: +// CHECK2-NEXT: br 
label [[OMP_INNER_FOR_INC:%.*]] +// CHECK2: omp.inner.for.inc: +// CHECK2-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK2-NEXT: [[ADD4:%.*]] = add nsw i32 [[TMP11]], [[TMP12]] +// CHECK2-NEXT: store i32 [[ADD4]], i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP16:![0-9]+]] +// CHECK2: omp.inner.for.end: +// CHECK2-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK2: omp.loop.exit: +// CHECK2-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP4]]) +// CHECK2-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK2-NEXT: [[TMP14:%.*]] = icmp ne i32 [[TMP13]], 0 +// CHECK2-NEXT: br i1 [[TMP14]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] +// CHECK2: .omp.final.then: +// CHECK2-NEXT: store i32 10, i32* [[I]], align 4 +// CHECK2-NEXT: br label [[DOTOMP_FINAL_DONE]] +// CHECK2: .omp.final.done: +// CHECK2-NEXT: ret void +// +// +// CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l54 +// CHECK2-SAME: ([10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i64 [[F:%.*]]) #[[ATTR0]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 8 +// CHECK2-NEXT: [[F_ADDR:%.*]] = alloca i64, align 8 +// CHECK2-NEXT: [[F_CASTED:%.*]] = alloca i64, align 8 +// CHECK2-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK2-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 8 +// CHECK2-NEXT: store i64 [[F]], i64* [[F_ADDR]], align 8 +// CHECK2-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 8 +// CHECK2-NEXT: [[CONV:%.*]] = bitcast i64* [[F_ADDR]] to i32* +// CHECK2-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK2-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) +// CHECK2-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK2: .execute: +// CHECK2-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) +// CHECK2-NEXT: [[TMP2:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK2-NEXT: [[CONV1:%.*]] = bitcast i64* [[F_CASTED]] to i32* +// CHECK2-NEXT: store i32 [[TMP2]], i32* [[CONV1]], align 4 +// CHECK2-NEXT: [[TMP3:%.*]] = load i64, i64* [[F_CASTED]], align 8 +// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK2-NEXT: call void @__omp_outlined__6(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x [10 x i32]]* [[TMP0]], i64 [[TMP3]]) #[[ATTR2]] +// CHECK2-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK2: .omp.deinit: +// CHECK2-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) +// CHECK2-NEXT: br label [[DOTEXIT:%.*]] +// CHECK2: .exit: +// CHECK2-NEXT: ret void +// +// +// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__6 +// CHECK2-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i64 [[F:%.*]]) #[[ATTR0]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK2-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 8 +// CHECK2-NEXT: [[F_ADDR:%.*]] = alloca i64, align 8 +// CHECK2-NEXT: 
[[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[_TMP1:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[K:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[J:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[F_CASTED:%.*]] = alloca i64, align 8 +// CHECK2-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x i8*], align 8 +// CHECK2-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK2-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +// CHECK2-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 8 +// CHECK2-NEXT: store i64 [[F]], i64* [[F_ADDR]], align 8 +// CHECK2-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 8 +// CHECK2-NEXT: [[CONV:%.*]] = bitcast i64* [[F_ADDR]] to i32* +// CHECK2-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK2-NEXT: store i32 99, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK2-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK2-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK2-NEXT: [[TMP1:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK2-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4 +// CHECK2-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) +// CHECK2-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP3]], 99 +// CHECK2-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK2: cond.true: +// CHECK2-NEXT: br label [[COND_END:%.*]] +// CHECK2: cond.false: +// CHECK2-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: br label [[COND_END]] +// CHECK2: cond.end: +// CHECK2-NEXT: [[COND:%.*]] = phi i32 [ 99, [[COND_TRUE]] ], [ [[TMP4]], [[COND_FALSE]] ] +// CHECK2-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK2-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK2: omp.inner.for.cond: +// CHECK2-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: [[CMP2:%.*]] = icmp slt i32 [[TMP6]], 100 +// CHECK2-NEXT: br i1 [[CMP2]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK2: omp.inner.for.body: +// CHECK2-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK2-NEXT: [[TMP8:%.*]] = zext i32 [[TMP7]] to i64 +// CHECK2-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: [[TMP10:%.*]] = zext i32 [[TMP9]] to i64 +// CHECK2-NEXT: [[TMP11:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK2-NEXT: [[CONV3:%.*]] = bitcast i64* [[F_CASTED]] to i32* +// CHECK2-NEXT: store i32 [[TMP11]], i32* [[CONV3]], align 4 +// CHECK2-NEXT: [[TMP12:%.*]] = load i64, i64* [[F_CASTED]], align 8 +// CHECK2-NEXT: [[TMP13:%.*]] = getelementptr inbounds [4 x 
i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0 +// CHECK2-NEXT: [[TMP14:%.*]] = inttoptr i64 [[TMP8]] to i8* +// CHECK2-NEXT: store i8* [[TMP14]], i8** [[TMP13]], align 8 +// CHECK2-NEXT: [[TMP15:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1 +// CHECK2-NEXT: [[TMP16:%.*]] = inttoptr i64 [[TMP10]] to i8* +// CHECK2-NEXT: store i8* [[TMP16]], i8** [[TMP15]], align 8 +// CHECK2-NEXT: [[TMP17:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 2 +// CHECK2-NEXT: [[TMP18:%.*]] = bitcast [10 x [10 x i32]]* [[TMP0]] to i8* +// CHECK2-NEXT: store i8* [[TMP18]], i8** [[TMP17]], align 8 +// CHECK2-NEXT: [[TMP19:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 3 +// CHECK2-NEXT: [[TMP20:%.*]] = inttoptr i64 [[TMP12]] to i8* +// CHECK2-NEXT: store i8* [[TMP20]], i8** [[TMP19]], align 8 +// CHECK2-NEXT: [[TMP21:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK2-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i64, i64, [10 x [10 x i32]]*, i64)* @__omp_outlined__7 to i8*), i8* null, i8** [[TMP21]], i64 4) +// CHECK2-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK2: omp.inner.for.inc: +// CHECK2-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK2-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP22]], [[TMP23]] +// CHECK2-NEXT: store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK2-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK2-NEXT: [[ADD4:%.*]] = add nsw i32 [[TMP24]], [[TMP25]] +// CHECK2-NEXT: store i32 [[ADD4]], i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK2-NEXT: [[TMP26:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: [[TMP27:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK2-NEXT: [[ADD5:%.*]] = add nsw i32 [[TMP26]], [[TMP27]] +// CHECK2-NEXT: store i32 [[ADD5]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: [[TMP28:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[TMP28]], 99 +// CHECK2-NEXT: br i1 [[CMP6]], label [[COND_TRUE7:%.*]], label [[COND_FALSE8:%.*]] +// CHECK2: cond.true7: +// CHECK2-NEXT: br label [[COND_END9:%.*]] +// CHECK2: cond.false8: +// CHECK2-NEXT: [[TMP29:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: br label [[COND_END9]] +// CHECK2: cond.end9: +// CHECK2-NEXT: [[COND10:%.*]] = phi i32 [ 99, [[COND_TRUE7]] ], [ [[TMP29]], [[COND_FALSE8]] ] +// CHECK2-NEXT: store i32 [[COND10]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: [[TMP30:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK2-NEXT: store i32 [[TMP30]], i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP17:![0-9]+]] +// CHECK2: omp.inner.for.end: +// CHECK2-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK2: omp.loop.exit: +// CHECK2-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]]) +// CHECK2-NEXT: [[TMP31:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK2-NEXT: [[TMP32:%.*]] = icmp ne i32 [[TMP31]], 0 +// CHECK2-NEXT: br i1 [[TMP32]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] +// CHECK2: .omp.final.then: +// CHECK2-NEXT: store i32 10, i32* [[I]], align 4 +// CHECK2-NEXT: store i32 
10, i32* [[J]], align 4 +// CHECK2-NEXT: br label [[DOTOMP_FINAL_DONE]] +// CHECK2: .omp.final.done: +// CHECK2-NEXT: ret void +// +// +// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__7 +// CHECK2-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i64 [[F:%.*]]) #[[ATTR0]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK2-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8 +// CHECK2-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8 +// CHECK2-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 8 +// CHECK2-NEXT: [[F_ADDR:%.*]] = alloca i64, align 8 +// CHECK2-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[_TMP1:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[K:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[J:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK2-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +// CHECK2-NEXT: store i64 [[DOTPREVIOUS_LB_]], i64* [[DOTPREVIOUS_LB__ADDR]], align 8 +// CHECK2-NEXT: store i64 [[DOTPREVIOUS_UB_]], i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK2-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 8 +// CHECK2-NEXT: store i64 [[F]], i64* [[F_ADDR]], align 8 +// CHECK2-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 8 +// CHECK2-NEXT: [[CONV:%.*]] = bitcast i64* [[F_ADDR]] to i32* +// CHECK2-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 +// CHECK2-NEXT: store i32 99, i32* [[DOTOMP_UB]], align 4 +// CHECK2-NEXT: [[TMP1:%.*]] = load i64, i64* [[DOTPREVIOUS_LB__ADDR]], align 8 +// CHECK2-NEXT: [[CONV2:%.*]] = trunc i64 [[TMP1]] to i32 +// CHECK2-NEXT: [[TMP2:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK2-NEXT: [[CONV3:%.*]] = trunc i64 [[TMP2]] to i32 +// CHECK2-NEXT: store i32 [[CONV2]], i32* [[DOTOMP_LB]], align 4 +// CHECK2-NEXT: store i32 [[CONV3]], i32* [[DOTOMP_UB]], align 4 +// CHECK2-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK2-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK2-NEXT: [[TMP3:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK2-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4 +// CHECK2-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP4]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) +// CHECK2-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK2-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK2: omp.inner.for.cond: +// CHECK2-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: [[CONV4:%.*]] = sext i32 [[TMP6]] to i64 +// CHECK2-NEXT: [[TMP7:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK2-NEXT: [[CMP:%.*]] = icmp ule i64 [[CONV4]], [[TMP7]] +// 
CHECK2-NEXT: br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK2: omp.inner.for.body: +// CHECK2-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: [[DIV:%.*]] = sdiv i32 [[TMP8]], 10 +// CHECK2-NEXT: [[MUL:%.*]] = mul nsw i32 [[DIV]], 1 +// CHECK2-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK2-NEXT: store i32 [[ADD]], i32* [[I]], align 4 +// CHECK2-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: [[DIV5:%.*]] = sdiv i32 [[TMP10]], 10 +// CHECK2-NEXT: [[MUL6:%.*]] = mul nsw i32 [[DIV5]], 10 +// CHECK2-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP9]], [[MUL6]] +// CHECK2-NEXT: [[MUL7:%.*]] = mul nsw i32 [[SUB]], 1 +// CHECK2-NEXT: [[ADD8:%.*]] = add nsw i32 0, [[MUL7]] +// CHECK2-NEXT: store i32 [[ADD8]], i32* [[J]], align 4 +// CHECK2-NEXT: store i32 10, i32* [[K]], align 4 +// CHECK2-NEXT: [[TMP11:%.*]] = load i32, i32* [[I]], align 4 +// CHECK2-NEXT: [[TMP12:%.*]] = load i32, i32* [[J]], align 4 +// CHECK2-NEXT: [[TMP13:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK2-NEXT: [[MUL9:%.*]] = mul nsw i32 [[TMP12]], [[TMP13]] +// CHECK2-NEXT: [[ADD10:%.*]] = add nsw i32 [[TMP11]], [[MUL9]] +// CHECK2-NEXT: [[TMP14:%.*]] = load i32, i32* [[K]], align 4 +// CHECK2-NEXT: [[ADD11:%.*]] = add nsw i32 [[ADD10]], [[TMP14]] +// CHECK2-NEXT: [[TMP15:%.*]] = load i32, i32* [[I]], align 4 +// CHECK2-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP15]] to i64 +// CHECK2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* [[TMP0]], i64 0, i64 [[IDXPROM]] +// CHECK2-NEXT: [[TMP16:%.*]] = load i32, i32* [[J]], align 4 +// CHECK2-NEXT: [[IDXPROM12:%.*]] = sext i32 [[TMP16]] to i64 +// CHECK2-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[ARRAYIDX]], i64 0, i64 [[IDXPROM12]] +// CHECK2-NEXT: store i32 [[ADD11]], i32* [[ARRAYIDX13]], align 4 +// CHECK2-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK2: omp.body.continue: +// CHECK2-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK2: omp.inner.for.inc: +// CHECK2-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK2-NEXT: [[ADD14:%.*]] = add nsw i32 [[TMP17]], [[TMP18]] +// CHECK2-NEXT: store i32 [[ADD14]], i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP18:![0-9]+]] +// CHECK2: omp.inner.for.end: +// CHECK2-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK2: omp.loop.exit: +// CHECK2-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP4]]) +// CHECK2-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK2-NEXT: [[TMP20:%.*]] = icmp ne i32 [[TMP19]], 0 +// CHECK2-NEXT: br i1 [[TMP20]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] +// CHECK2: .omp.final.then: +// CHECK2-NEXT: store i32 10, i32* [[I]], align 4 +// CHECK2-NEXT: store i32 10, i32* [[J]], align 4 +// CHECK2-NEXT: br label [[DOTOMP_FINAL_DONE]] +// CHECK2: .omp.final.done: +// CHECK2-NEXT: ret void +// +// +// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l38 +// CHECK3-SAME: (i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32 [[L:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK3-NEXT: entry: +// CHECK3-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4 +// 
CHECK3-NEXT: [[L_ADDR:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[L_CASTED:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK3-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK3-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4 +// CHECK3-NEXT: store i32 [[L]], i32* [[L_ADDR]], align 4 +// CHECK3-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4 +// CHECK3-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK3-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) +// CHECK3-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK3: .execute: +// CHECK3-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3:[0-9]+]]) +// CHECK3-NEXT: [[TMP2:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK3-NEXT: store i32 [[TMP2]], i32* [[N_CASTED]], align 4 +// CHECK3-NEXT: [[TMP3:%.*]] = load i32, i32* [[N_CASTED]], align 4 +// CHECK3-NEXT: [[TMP4:%.*]] = load i32, i32* [[L_ADDR]], align 4 +// CHECK3-NEXT: store i32 [[TMP4]], i32* [[L_CASTED]], align 4 +// CHECK3-NEXT: [[TMP5:%.*]] = load i32, i32* [[L_CASTED]], align 4 +// CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK3-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32 [[TMP3]], [1000 x i32]* [[TMP0]], i32 [[TMP5]]) #[[ATTR2:[0-9]+]] +// CHECK3-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK3: .omp.deinit: +// CHECK3-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) +// CHECK3-NEXT: br label [[DOTEXIT:%.*]] +// CHECK3: .exit: +// CHECK3-NEXT: ret void +// +// +// CHECK3-LABEL: define {{[^@]+}}@__omp_outlined__ +// CHECK3-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32 [[L:%.*]]) #[[ATTR0]] { +// CHECK3-NEXT: entry: +// CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK3-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4 +// CHECK3-NEXT: [[L_ADDR:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[I4:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[L_CASTED:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [5 x i8*], align 4 +// CHECK3-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK3-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK3-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK3-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4 +// CHECK3-NEXT: store i32 [[L]], i32* 
[[L_ADDR]], align 4 +// CHECK3-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4 +// CHECK3-NEXT: [[TMP1:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared", align 2 +// CHECK3-NEXT: [[TMP2:%.*]] = load i32, i32* @"_openmp_static_kernel$size", align 4 +// CHECK3-NEXT: call void @__kmpc_get_team_static_memory(i16 1, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds (%"union._shared_openmp_static_memory_type_$_", %"union._shared_openmp_static_memory_type_$_" addrspace(3)* @"_openmp_shared_static_glob_rd_$_", i32 0, i32 0, i32 0) to i8*), i32 [[TMP2]], i16 [[TMP1]], i8** addrspacecast (i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr" to i8**)) +// CHECK3-NEXT: [[TMP3:%.*]] = load i8*, i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr", align 4 +// CHECK3-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, i8* [[TMP3]], i32 0 +// CHECK3-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to %struct._globalized_locals_ty* +// CHECK3-NEXT: [[L1:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP5]], i32 0, i32 0 +// CHECK3-NEXT: [[TMP6:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK3-NEXT: store i32 [[TMP6]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK3-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK3-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP7]], 0 +// CHECK3-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK3-NEXT: [[SUB3:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK3-NEXT: store i32 [[SUB3]], i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK3-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK3-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK3-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP8]] +// CHECK3-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK3: omp.precond.then: +// CHECK3-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK3-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK3-NEXT: store i32 [[TMP9]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK3-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK3-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK3-NEXT: [[TMP10:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK3-NEXT: [[TMP11:%.*]] = load i32, i32* [[TMP10]], align 4 +// CHECK3-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1:[0-9]+]], i32 [[TMP11]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 128) +// CHECK3-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK3-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK3-NEXT: [[CMP5:%.*]] = icmp sgt i32 [[TMP12]], [[TMP13]] +// CHECK3-NEXT: br i1 [[CMP5]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK3: cond.true: +// CHECK3-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK3-NEXT: br label [[COND_END:%.*]] +// CHECK3: cond.false: +// CHECK3-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK3-NEXT: br label [[COND_END]] +// CHECK3: cond.end: +// CHECK3-NEXT: [[COND:%.*]] = phi i32 [ [[TMP14]], [[COND_TRUE]] ], [ [[TMP15]], [[COND_FALSE]] ] +// CHECK3-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK3-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK3-NEXT: store i32 [[TMP16]], i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: br label 
[[OMP_INNER_FOR_COND:%.*]] +// CHECK3: omp.inner.for.cond: +// CHECK3-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK3-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP18]], 1 +// CHECK3-NEXT: [[CMP6:%.*]] = icmp slt i32 [[TMP17]], [[ADD]] +// CHECK3-NEXT: br i1 [[CMP6]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK3: omp.inner.for.body: +// CHECK3-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK3-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK3-NEXT: [[TMP21:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK3-NEXT: store i32 [[TMP21]], i32* [[N_CASTED]], align 4 +// CHECK3-NEXT: [[TMP22:%.*]] = load i32, i32* [[N_CASTED]], align 4 +// CHECK3-NEXT: [[TMP23:%.*]] = load i32, i32* [[L_ADDR]], align 4 +// CHECK3-NEXT: store i32 [[TMP23]], i32* [[L_CASTED]], align 4 +// CHECK3-NEXT: [[TMP24:%.*]] = load i32, i32* [[L_CASTED]], align 4 +// CHECK3-NEXT: [[TMP25:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 +// CHECK3-NEXT: [[TMP26:%.*]] = inttoptr i32 [[TMP19]] to i8* +// CHECK3-NEXT: store i8* [[TMP26]], i8** [[TMP25]], align 4 +// CHECK3-NEXT: [[TMP27:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 +// CHECK3-NEXT: [[TMP28:%.*]] = inttoptr i32 [[TMP20]] to i8* +// CHECK3-NEXT: store i8* [[TMP28]], i8** [[TMP27]], align 4 +// CHECK3-NEXT: [[TMP29:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 +// CHECK3-NEXT: [[TMP30:%.*]] = inttoptr i32 [[TMP22]] to i8* +// CHECK3-NEXT: store i8* [[TMP30]], i8** [[TMP29]], align 4 +// CHECK3-NEXT: [[TMP31:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3 +// CHECK3-NEXT: [[TMP32:%.*]] = bitcast [1000 x i32]* [[TMP0]] to i8* +// CHECK3-NEXT: store i8* [[TMP32]], i8** [[TMP31]], align 4 +// CHECK3-NEXT: [[TMP33:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 4 +// CHECK3-NEXT: [[TMP34:%.*]] = inttoptr i32 [[TMP24]] to i8* +// CHECK3-NEXT: store i8* [[TMP34]], i8** [[TMP33]], align 4 +// CHECK3-NEXT: [[TMP35:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK3-NEXT: [[TMP36:%.*]] = load i32, i32* [[TMP35]], align 4 +// CHECK3-NEXT: [[TMP37:%.*]] = bitcast [5 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK3-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP36]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, i32, [1000 x i32]*, i32)* @__omp_outlined__1 to i8*), i8* null, i8** [[TMP37]], i32 5) +// CHECK3-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK3: omp.inner.for.inc: +// CHECK3-NEXT: [[TMP38:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: [[TMP39:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK3-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP38]], [[TMP39]] +// CHECK3-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: [[TMP40:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK3-NEXT: [[TMP41:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK3-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP40]], [[TMP41]] +// CHECK3-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK3-NEXT: [[TMP42:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK3-NEXT: [[TMP43:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK3-NEXT: [[ADD9:%.*]] = add nsw i32 
[[TMP42]], [[TMP43]] +// CHECK3-NEXT: store i32 [[ADD9]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK3-NEXT: [[TMP44:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK3-NEXT: [[TMP45:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK3-NEXT: [[CMP10:%.*]] = icmp sgt i32 [[TMP44]], [[TMP45]] +// CHECK3-NEXT: br i1 [[CMP10]], label [[COND_TRUE11:%.*]], label [[COND_FALSE12:%.*]] +// CHECK3: cond.true11: +// CHECK3-NEXT: [[TMP46:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK3-NEXT: br label [[COND_END13:%.*]] +// CHECK3: cond.false12: +// CHECK3-NEXT: [[TMP47:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK3-NEXT: br label [[COND_END13]] +// CHECK3: cond.end13: +// CHECK3-NEXT: [[COND14:%.*]] = phi i32 [ [[TMP46]], [[COND_TRUE11]] ], [ [[TMP47]], [[COND_FALSE12]] ] +// CHECK3-NEXT: store i32 [[COND14]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK3-NEXT: [[TMP48:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK3-NEXT: store i32 [[TMP48]], i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP10:![0-9]+]] +// CHECK3: omp.inner.for.end: +// CHECK3-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK3: omp.loop.exit: +// CHECK3-NEXT: [[TMP49:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK3-NEXT: [[TMP50:%.*]] = load i32, i32* [[TMP49]], align 4 +// CHECK3-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP50]]) +// CHECK3-NEXT: [[TMP51:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK3-NEXT: [[TMP52:%.*]] = icmp ne i32 [[TMP51]], 0 +// CHECK3-NEXT: br i1 [[TMP52]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] +// CHECK3: .omp.final.then: +// CHECK3-NEXT: [[TMP53:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK3-NEXT: [[SUB15:%.*]] = sub nsw i32 [[TMP53]], 0 +// CHECK3-NEXT: [[DIV16:%.*]] = sdiv i32 [[SUB15]], 1 +// CHECK3-NEXT: [[MUL:%.*]] = mul nsw i32 [[DIV16]], 1 +// CHECK3-NEXT: [[ADD17:%.*]] = add nsw i32 0, [[MUL]] +// CHECK3-NEXT: store i32 [[ADD17]], i32* [[I4]], align 4 +// CHECK3-NEXT: br label [[DOTOMP_FINAL_DONE]] +// CHECK3: .omp.final.done: +// CHECK3-NEXT: [[TMP54:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK3-NEXT: [[TMP55:%.*]] = icmp ne i32 [[TMP54]], 0 +// CHECK3-NEXT: br i1 [[TMP55]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]] +// CHECK3: .omp.lastprivate.then: +// CHECK3-NEXT: [[TMP56:%.*]] = load i32, i32* [[L_ADDR]], align 4 +// CHECK3-NEXT: store i32 [[TMP56]], i32* [[L_ADDR]], align 4 +// CHECK3-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]] +// CHECK3: .omp.lastprivate.done: +// CHECK3-NEXT: br label [[OMP_PRECOND_END]] +// CHECK3: omp.precond.end: +// CHECK3-NEXT: [[TMP57:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared", align 2 +// CHECK3-NEXT: call void @__kmpc_restore_team_static_memory(i16 1, i16 [[TMP57]]) +// CHECK3-NEXT: ret void +// +// +// CHECK3-LABEL: define {{[^@]+}}@__omp_outlined__1 +// CHECK3-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32 [[L:%.*]]) #[[ATTR0]] { +// CHECK3-NEXT: entry: +// CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK3-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 
4 +// CHECK3-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4 +// CHECK3-NEXT: [[L_ADDR:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[I3:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK3-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK3-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK3-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK3-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK3-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4 +// CHECK3-NEXT: store i32 [[L]], i32* [[L_ADDR]], align 4 +// CHECK3-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4 +// CHECK3-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK3-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK3-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 +// CHECK3-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK3-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK3-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK3-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK3-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK3-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] +// CHECK3-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK3: omp.precond.then: +// CHECK3-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 +// CHECK3-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK3-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_UB]], align 4 +// CHECK3-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK3-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK3-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_LB]], align 4 +// CHECK3-NEXT: store i32 [[TMP6]], i32* [[DOTOMP_UB]], align 4 +// CHECK3-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK3-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK3-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK3-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4 +// CHECK3-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP8]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 32) +// CHECK3-NEXT: br label [[OMP_DISPATCH_COND:%.*]] +// CHECK3: omp.dispatch.cond: +// CHECK3-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK3-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK3-NEXT: [[CMP4:%.*]] = icmp ugt i32 [[TMP9]], [[TMP10]] +// CHECK3-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK3: cond.true: +// CHECK3-NEXT: [[TMP11:%.*]] = 
load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK3-NEXT: br label [[COND_END:%.*]] +// CHECK3: cond.false: +// CHECK3-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK3-NEXT: br label [[COND_END]] +// CHECK3: cond.end: +// CHECK3-NEXT: [[COND:%.*]] = phi i32 [ [[TMP11]], [[COND_TRUE]] ], [ [[TMP12]], [[COND_FALSE]] ] +// CHECK3-NEXT: store i32 [[COND]], i32* [[DOTOMP_UB]], align 4 +// CHECK3-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK3-NEXT: store i32 [[TMP13]], i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK3-NEXT: [[CMP5:%.*]] = icmp sle i32 [[TMP14]], [[TMP15]] +// CHECK3-NEXT: br i1 [[CMP5]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]] +// CHECK3: omp.dispatch.body: +// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK3: omp.inner.for.cond: +// CHECK3-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK3-NEXT: [[CMP6:%.*]] = icmp sle i32 [[TMP16]], [[TMP17]] +// CHECK3-NEXT: br i1 [[CMP6]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK3: omp.inner.for.body: +// CHECK3-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP18]], 1 +// CHECK3-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK3-NEXT: store i32 [[ADD]], i32* [[I3]], align 4 +// CHECK3-NEXT: [[TMP19:%.*]] = load i32, i32* [[I3]], align 4 +// CHECK3-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1000 x i32], [1000 x i32]* [[TMP0]], i32 0, i32 [[TMP19]] +// CHECK3-NEXT: store i32 1, i32* [[ARRAYIDX]], align 4 +// CHECK3-NEXT: [[TMP20:%.*]] = load i32, i32* [[I3]], align 4 +// CHECK3-NEXT: store i32 [[TMP20]], i32* [[L_ADDR]], align 4 +// CHECK3-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK3: omp.body.continue: +// CHECK3-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK3: omp.inner.for.inc: +// CHECK3-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP21]], 1 +// CHECK3-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP12:![0-9]+]] +// CHECK3: omp.inner.for.end: +// CHECK3-NEXT: br label [[OMP_DISPATCH_INC:%.*]] +// CHECK3: omp.dispatch.inc: +// CHECK3-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK3-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK3-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP22]], [[TMP23]] +// CHECK3-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_LB]], align 4 +// CHECK3-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK3-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK3-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP24]], [[TMP25]] +// CHECK3-NEXT: store i32 [[ADD9]], i32* [[DOTOMP_UB]], align 4 +// CHECK3-NEXT: br label [[OMP_DISPATCH_COND]] +// CHECK3: omp.dispatch.end: +// CHECK3-NEXT: [[TMP26:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK3-NEXT: [[TMP27:%.*]] = load i32, i32* [[TMP26]], align 4 +// CHECK3-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP27]]) +// CHECK3-NEXT: [[TMP28:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK3-NEXT: [[TMP29:%.*]] = icmp ne i32 [[TMP28]], 0 +// CHECK3-NEXT: br i1 [[TMP29]], label [[DOTOMP_FINAL_THEN:%.*]], label 
[[DOTOMP_FINAL_DONE:%.*]] +// CHECK3: .omp.final.then: +// CHECK3-NEXT: [[TMP30:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK3-NEXT: [[SUB10:%.*]] = sub nsw i32 [[TMP30]], 0 +// CHECK3-NEXT: [[DIV11:%.*]] = sdiv i32 [[SUB10]], 1 +// CHECK3-NEXT: [[MUL12:%.*]] = mul nsw i32 [[DIV11]], 1 +// CHECK3-NEXT: [[ADD13:%.*]] = add nsw i32 0, [[MUL12]] +// CHECK3-NEXT: store i32 [[ADD13]], i32* [[I3]], align 4 +// CHECK3-NEXT: br label [[DOTOMP_FINAL_DONE]] +// CHECK3: .omp.final.done: +// CHECK3-NEXT: [[TMP31:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK3-NEXT: [[TMP32:%.*]] = icmp ne i32 [[TMP31]], 0 +// CHECK3-NEXT: br i1 [[TMP32]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]] +// CHECK3: .omp.lastprivate.then: +// CHECK3-NEXT: [[TMP33:%.*]] = load i32, i32* [[L_ADDR]], align 4 +// CHECK3-NEXT: store i32 [[TMP33]], i32* [[L_ADDR]], align 4 +// CHECK3-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]] +// CHECK3: .omp.lastprivate.done: +// CHECK3-NEXT: br label [[OMP_PRECOND_END]] +// CHECK3: omp.precond.end: +// CHECK3-NEXT: ret void +// +// +// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l44 +// CHECK3-SAME: (i32 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { +// CHECK3-NEXT: entry: +// CHECK3-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 4 +// CHECK3-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK3-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK3-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 4 +// CHECK3-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 4 +// CHECK3-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK3-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) +// CHECK3-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK3: .execute: +// CHECK3-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) +// CHECK3-NEXT: [[TMP2:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK3-NEXT: store i32 [[TMP2]], i32* [[N_CASTED]], align 4 +// CHECK3-NEXT: [[TMP3:%.*]] = load i32, i32* [[N_CASTED]], align 4 +// CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK3-NEXT: call void @__omp_outlined__2(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32 [[TMP3]], [1000 x i16]* [[TMP0]]) #[[ATTR2]] +// CHECK3-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK3: .omp.deinit: +// CHECK3-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) +// CHECK3-NEXT: br label [[DOTEXIT:%.*]] +// CHECK3: .exit: +// CHECK3-NEXT: ret void +// +// +// CHECK3-LABEL: define {{[^@]+}}@__omp_outlined__2 +// CHECK3-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { +// CHECK3-NEXT: entry: +// CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK3-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 4 +// CHECK3-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// 
CHECK3-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[I3:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x i8*], align 4 +// CHECK3-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK3-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK3-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK3-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 4 +// CHECK3-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 4 +// CHECK3-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK3-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK3-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 +// CHECK3-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK3-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK3-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK3-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK3-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK3-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] +// CHECK3-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK3: omp.precond.then: +// CHECK3-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK3-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK3-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK3-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK3-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK3-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK3-NEXT: [[TMP5:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK3-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4 +// CHECK3-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP6]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) +// CHECK3-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK3-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK3-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP7]], [[TMP8]] +// CHECK3-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK3: cond.true: +// CHECK3-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK3-NEXT: br label [[COND_END:%.*]] +// CHECK3: cond.false: +// CHECK3-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK3-NEXT: br label [[COND_END]] +// CHECK3: cond.end: +// CHECK3-NEXT: [[COND:%.*]] = phi i32 [ [[TMP9]], [[COND_TRUE]] ], [ [[TMP10]], [[COND_FALSE]] ] +// CHECK3-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK3-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK3-NEXT: store i32 [[TMP11]], i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK3: 
omp.inner.for.cond: +// CHECK3-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK3-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP13]], 1 +// CHECK3-NEXT: [[CMP5:%.*]] = icmp slt i32 [[TMP12]], [[ADD]] +// CHECK3-NEXT: br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK3: omp.inner.for.body: +// CHECK3-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK3-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK3-NEXT: [[TMP16:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK3-NEXT: store i32 [[TMP16]], i32* [[N_CASTED]], align 4 +// CHECK3-NEXT: [[TMP17:%.*]] = load i32, i32* [[N_CASTED]], align 4 +// CHECK3-NEXT: [[TMP18:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 +// CHECK3-NEXT: [[TMP19:%.*]] = inttoptr i32 [[TMP14]] to i8* +// CHECK3-NEXT: store i8* [[TMP19]], i8** [[TMP18]], align 4 +// CHECK3-NEXT: [[TMP20:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 +// CHECK3-NEXT: [[TMP21:%.*]] = inttoptr i32 [[TMP15]] to i8* +// CHECK3-NEXT: store i8* [[TMP21]], i8** [[TMP20]], align 4 +// CHECK3-NEXT: [[TMP22:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 +// CHECK3-NEXT: [[TMP23:%.*]] = inttoptr i32 [[TMP17]] to i8* +// CHECK3-NEXT: store i8* [[TMP23]], i8** [[TMP22]], align 4 +// CHECK3-NEXT: [[TMP24:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3 +// CHECK3-NEXT: [[TMP25:%.*]] = bitcast [1000 x i16]* [[TMP0]] to i8* +// CHECK3-NEXT: store i8* [[TMP25]], i8** [[TMP24]], align 4 +// CHECK3-NEXT: [[TMP26:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK3-NEXT: [[TMP27:%.*]] = load i32, i32* [[TMP26]], align 4 +// CHECK3-NEXT: [[TMP28:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK3-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP27]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, i32, [1000 x i16]*)* @__omp_outlined__3 to i8*), i8* null, i8** [[TMP28]], i32 4) +// CHECK3-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK3: omp.inner.for.inc: +// CHECK3-NEXT: [[TMP29:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: [[TMP30:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK3-NEXT: [[ADD6:%.*]] = add nsw i32 [[TMP29]], [[TMP30]] +// CHECK3-NEXT: store i32 [[ADD6]], i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: [[TMP31:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK3-NEXT: [[TMP32:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK3-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP31]], [[TMP32]] +// CHECK3-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK3-NEXT: [[TMP33:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK3-NEXT: [[TMP34:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK3-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP33]], [[TMP34]] +// CHECK3-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK3-NEXT: [[TMP35:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK3-NEXT: [[TMP36:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK3-NEXT: [[CMP9:%.*]] = icmp sgt i32 [[TMP35]], [[TMP36]] +// CHECK3-NEXT: br i1 [[CMP9]], label [[COND_TRUE10:%.*]], label [[COND_FALSE11:%.*]] +// CHECK3: cond.true10: +// CHECK3-NEXT: [[TMP37:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], 
align 4 +// CHECK3-NEXT: br label [[COND_END12:%.*]] +// CHECK3: cond.false11: +// CHECK3-NEXT: [[TMP38:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK3-NEXT: br label [[COND_END12]] +// CHECK3: cond.end12: +// CHECK3-NEXT: [[COND13:%.*]] = phi i32 [ [[TMP37]], [[COND_TRUE10]] ], [ [[TMP38]], [[COND_FALSE11]] ] +// CHECK3-NEXT: store i32 [[COND13]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK3-NEXT: [[TMP39:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK3-NEXT: store i32 [[TMP39]], i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP13:![0-9]+]] +// CHECK3: omp.inner.for.end: +// CHECK3-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK3: omp.loop.exit: +// CHECK3-NEXT: [[TMP40:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK3-NEXT: [[TMP41:%.*]] = load i32, i32* [[TMP40]], align 4 +// CHECK3-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP41]]) +// CHECK3-NEXT: [[TMP42:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK3-NEXT: [[TMP43:%.*]] = icmp ne i32 [[TMP42]], 0 +// CHECK3-NEXT: br i1 [[TMP43]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] +// CHECK3: .omp.final.then: +// CHECK3-NEXT: [[TMP44:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK3-NEXT: [[SUB14:%.*]] = sub nsw i32 [[TMP44]], 0 +// CHECK3-NEXT: [[DIV15:%.*]] = sdiv i32 [[SUB14]], 1 +// CHECK3-NEXT: [[MUL:%.*]] = mul nsw i32 [[DIV15]], 1 +// CHECK3-NEXT: [[ADD16:%.*]] = add nsw i32 0, [[MUL]] +// CHECK3-NEXT: store i32 [[ADD16]], i32* [[I3]], align 4 +// CHECK3-NEXT: br label [[DOTOMP_FINAL_DONE]] +// CHECK3: .omp.final.done: +// CHECK3-NEXT: br label [[OMP_PRECOND_END]] +// CHECK3: omp.precond.end: +// CHECK3-NEXT: ret void +// +// +// CHECK3-LABEL: define {{[^@]+}}@__omp_outlined__3 +// CHECK3-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { +// CHECK3-NEXT: entry: +// CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK3-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 4 +// CHECK3-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[I3:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK3-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK3-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK3-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK3-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK3-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 4 +// 
CHECK3-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 4 +// CHECK3-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK3-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK3-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 +// CHECK3-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK3-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK3-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK3-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK3-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK3-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] +// CHECK3-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK3: omp.precond.then: +// CHECK3-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 +// CHECK3-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK3-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_UB]], align 4 +// CHECK3-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK3-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK3-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_LB]], align 4 +// CHECK3-NEXT: store i32 [[TMP6]], i32* [[DOTOMP_UB]], align 4 +// CHECK3-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK3-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK3-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK3-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4 +// CHECK3-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP8]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) +// CHECK3-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK3-NEXT: store i32 [[TMP9]], i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK3: omp.inner.for.cond: +// CHECK3-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK3-NEXT: [[CMP4:%.*]] = icmp ule i32 [[TMP10]], [[TMP11]] +// CHECK3-NEXT: br i1 [[CMP4]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK3: omp.inner.for.body: +// CHECK3-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP12]], 1 +// CHECK3-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK3-NEXT: store i32 [[ADD]], i32* [[I3]], align 4 +// CHECK3-NEXT: [[TMP13:%.*]] = load i32, i32* [[I3]], align 4 +// CHECK3-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1000 x i16], [1000 x i16]* [[TMP0]], i32 0, i32 [[TMP13]] +// CHECK3-NEXT: [[TMP14:%.*]] = load i16, i16* [[ARRAYIDX]], align 2 +// CHECK3-NEXT: [[CONV:%.*]] = sext i16 [[TMP14]] to i32 +// CHECK3-NEXT: [[ADD5:%.*]] = add nsw i32 [[CONV]], 1 +// CHECK3-NEXT: [[CONV6:%.*]] = trunc i32 [[ADD5]] to i16 +// CHECK3-NEXT: store i16 [[CONV6]], i16* [[ARRAYIDX]], align 2 +// CHECK3-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK3: omp.body.continue: +// CHECK3-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK3: omp.inner.for.inc: +// CHECK3-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK3-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP15]], [[TMP16]] +// CHECK3-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_IV]], align 4 
+// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP14:![0-9]+]] +// CHECK3: omp.inner.for.end: +// CHECK3-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK3: omp.loop.exit: +// CHECK3-NEXT: [[TMP17:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK3-NEXT: [[TMP18:%.*]] = load i32, i32* [[TMP17]], align 4 +// CHECK3-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP18]]) +// CHECK3-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK3-NEXT: [[TMP20:%.*]] = icmp ne i32 [[TMP19]], 0 +// CHECK3-NEXT: br i1 [[TMP20]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] +// CHECK3: .omp.final.then: +// CHECK3-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK3-NEXT: [[SUB8:%.*]] = sub nsw i32 [[TMP21]], 0 +// CHECK3-NEXT: [[DIV9:%.*]] = sdiv i32 [[SUB8]], 1 +// CHECK3-NEXT: [[MUL10:%.*]] = mul nsw i32 [[DIV9]], 1 +// CHECK3-NEXT: [[ADD11:%.*]] = add nsw i32 0, [[MUL10]] +// CHECK3-NEXT: store i32 [[ADD11]], i32* [[I3]], align 4 +// CHECK3-NEXT: br label [[DOTOMP_FINAL_DONE]] +// CHECK3: .omp.final.done: +// CHECK3-NEXT: br label [[OMP_PRECOND_END]] +// CHECK3: omp.precond.end: +// CHECK3-NEXT: ret void +// +// +// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l49 +// CHECK3-SAME: ([10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { +// CHECK3-NEXT: entry: +// CHECK3-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 +// CHECK3-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK3-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 +// CHECK3-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 +// CHECK3-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK3-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) +// CHECK3-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK3: .execute: +// CHECK3-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) +// CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK3-NEXT: call void @__omp_outlined__4(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x i32]* [[TMP0]]) #[[ATTR2]] +// CHECK3-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK3: .omp.deinit: +// CHECK3-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) +// CHECK3-NEXT: br label [[DOTEXIT:%.*]] +// CHECK3: .exit: +// CHECK3-NEXT: ret void +// +// +// CHECK3-LABEL: define {{[^@]+}}@__omp_outlined__4 +// CHECK3-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { +// CHECK3-NEXT: entry: +// CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK3-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 +// CHECK3-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [3 x i8*], align 4 +// 
CHECK3-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK3-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK3-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 +// CHECK3-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 +// CHECK3-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK3-NEXT: store i32 9, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK3-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK3-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK3-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK3-NEXT: [[TMP1:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK3-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4 +// CHECK3-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) +// CHECK3-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK3-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP3]], 9 +// CHECK3-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK3: cond.true: +// CHECK3-NEXT: br label [[COND_END:%.*]] +// CHECK3: cond.false: +// CHECK3-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK3-NEXT: br label [[COND_END]] +// CHECK3: cond.end: +// CHECK3-NEXT: [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP4]], [[COND_FALSE]] ] +// CHECK3-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK3-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK3-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK3: omp.inner.for.cond: +// CHECK3-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP6]], 10 +// CHECK3-NEXT: br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK3: omp.inner.for.body: +// CHECK3-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK3-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK3-NEXT: [[TMP9:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 +// CHECK3-NEXT: [[TMP10:%.*]] = inttoptr i32 [[TMP7]] to i8* +// CHECK3-NEXT: store i8* [[TMP10]], i8** [[TMP9]], align 4 +// CHECK3-NEXT: [[TMP11:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 +// CHECK3-NEXT: [[TMP12:%.*]] = inttoptr i32 [[TMP8]] to i8* +// CHECK3-NEXT: store i8* [[TMP12]], i8** [[TMP11]], align 4 +// CHECK3-NEXT: [[TMP13:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 +// CHECK3-NEXT: [[TMP14:%.*]] = bitcast [10 x i32]* [[TMP0]] to i8* +// CHECK3-NEXT: store i8* [[TMP14]], i8** [[TMP13]], align 4 +// CHECK3-NEXT: [[TMP15:%.*]] = bitcast [3 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK3-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, [10 x i32]*)* @__omp_outlined__5 to i8*), i8* null, i8** [[TMP15]], i32 3) +// CHECK3-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK3: omp.inner.for.inc: +// CHECK3-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// 
CHECK3-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP16]], [[TMP17]]
+// CHECK3-NEXT: store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4
+// CHECK3-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK3-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK3-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP18]], [[TMP19]]
+// CHECK3-NEXT: store i32 [[ADD2]], i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK3-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK3-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK3-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP20]], [[TMP21]]
+// CHECK3-NEXT: store i32 [[ADD3]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK3-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK3-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP22]], 9
+// CHECK3-NEXT: br i1 [[CMP4]], label [[COND_TRUE5:%.*]], label [[COND_FALSE6:%.*]]
+// CHECK3: cond.true5:
+// CHECK3-NEXT: br label [[COND_END7:%.*]]
+// CHECK3: cond.false6:
+// CHECK3-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK3-NEXT: br label [[COND_END7]]
+// CHECK3: cond.end7:
+// CHECK3-NEXT: [[COND8:%.*]] = phi i32 [ 9, [[COND_TRUE5]] ], [ [[TMP23]], [[COND_FALSE6]] ]
+// CHECK3-NEXT: store i32 [[COND8]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK3-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK3-NEXT: store i32 [[TMP24]], i32* [[DOTOMP_IV]], align 4
+// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP15:![0-9]+]]
+// CHECK3: omp.inner.for.end:
+// CHECK3-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK3: omp.loop.exit:
+// CHECK3-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]])
+// CHECK3-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK3-NEXT: [[TMP26:%.*]] = icmp ne i32 [[TMP25]], 0
+// CHECK3-NEXT: br i1 [[TMP26]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]]
+// CHECK3: .omp.final.then:
+// CHECK3-NEXT: store i32 10, i32* [[I]], align 4
+// CHECK3-NEXT: br label [[DOTOMP_FINAL_DONE]]
+// CHECK3: .omp.final.done:
+// CHECK3-NEXT: ret void
+//
+//
+// CHECK3-LABEL: define {{[^@]+}}@__omp_outlined__5
+// CHECK3-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] {
+// CHECK3-NEXT: entry:
+// CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK3-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4
+// CHECK3-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK3-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK3-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK3-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK3-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4
+// CHECK3-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4
+// CHECK3-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK3-NEXT: store i32 9, i32* [[DOTOMP_UB]], align 4
+// CHECK3-NEXT: [[TMP1:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK3-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTOMP_LB]], align 4
+// CHECK3-NEXT: store i32 [[TMP2]], i32* [[DOTOMP_UB]], align 4
+// CHECK3-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK3-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK3-NEXT: [[TMP3:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK3-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4
+// CHECK3-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP4]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK3-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK3-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4
+// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK3: omp.inner.for.cond:
+// CHECK3-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK3-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK3-NEXT: [[CMP:%.*]] = icmp ule i32 [[TMP6]], [[TMP7]]
+// CHECK3-NEXT: br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK3: omp.inner.for.body:
+// CHECK3-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK3-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP8]], 1
+// CHECK3-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK3-NEXT: store i32 [[ADD]], i32* [[I]], align 4
+// CHECK3-NEXT: [[TMP9:%.*]] = load i32, i32* [[I]], align 4
+// CHECK3-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP0]], i32 0, i32 [[TMP9]]
+// CHECK3-NEXT: [[TMP10:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+// CHECK3-NEXT: [[ADD1:%.*]] = add nsw i32 [[TMP10]], 1
+// CHECK3-NEXT: store i32 [[ADD1]], i32* [[ARRAYIDX]], align 4
+// CHECK3-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK3: omp.body.continue:
+// CHECK3-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK3: omp.inner.for.inc:
+// CHECK3-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK3-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK3-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP11]], [[TMP12]]
+// CHECK3-NEXT: store i32 [[ADD2]], i32* [[DOTOMP_IV]], align 4
+// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP16:![0-9]+]]
+// CHECK3: omp.inner.for.end:
+// CHECK3-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK3: omp.loop.exit:
+// CHECK3-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP4]])
+// CHECK3-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK3-NEXT: [[TMP14:%.*]] = icmp ne i32 [[TMP13]], 0
+// CHECK3-NEXT: br i1 [[TMP14]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]]
+// CHECK3: .omp.final.then:
+// CHECK3-NEXT: store i32 10, i32* [[I]], align 4
+// CHECK3-NEXT: br label [[DOTOMP_FINAL_DONE]]
+// CHECK3: .omp.final.done:
+// CHECK3-NEXT: ret void
+//
+//
+// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l54
+// CHECK3-SAME: ([10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i32 [[F:%.*]]) #[[ATTR0]] {
+// CHECK3-NEXT: entry:
+// CHECK3-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4
+// CHECK3-NEXT: [[F_ADDR:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[F_CASTED:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK3-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4
+// CHECK3-NEXT: store i32 [[F]], i32* [[F_ADDR]], align 4
+// CHECK3-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4
+// CHECK3-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK3-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0)
+// CHECK3-NEXT: br label [[DOTEXECUTE:%.*]]
+// CHECK3: .execute:
+// CHECK3-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]])
+// CHECK3-NEXT: [[TMP2:%.*]] = load i32, i32* [[F_ADDR]], align 4
+// CHECK3-NEXT: store i32 [[TMP2]], i32* [[F_CASTED]], align 4
+// CHECK3-NEXT: [[TMP3:%.*]] = load i32, i32* [[F_CASTED]], align 4
+// CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK3-NEXT: call void @__omp_outlined__6(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x [10 x i32]]* [[TMP0]], i32 [[TMP3]]) #[[ATTR2]]
+// CHECK3-NEXT: br label [[DOTOMP_DEINIT:%.*]]
+// CHECK3: .omp.deinit:
+// CHECK3-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0)
+// CHECK3-NEXT: br label [[DOTEXIT:%.*]]
+// CHECK3: .exit:
+// CHECK3-NEXT: ret void
+//
+//
+// CHECK3-LABEL: define {{[^@]+}}@__omp_outlined__6
+// CHECK3-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i32 [[F:%.*]]) #[[ATTR0]] {
+// CHECK3-NEXT: entry:
+// CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK3-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4
+// CHECK3-NEXT: [[F_ADDR:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[_TMP1:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[K:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[J:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[F_CASTED:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x i8*], align 4
+// CHECK3-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK3-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK3-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4
+// CHECK3-NEXT: store i32 [[F]], i32* [[F_ADDR]], align 4
+// CHECK3-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4
+// CHECK3-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK3-NEXT: store i32 99, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK3-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK3-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK3-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK3-NEXT: [[TMP1:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK3-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4
+// CHECK3-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]])
+// CHECK3-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK3-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP3]], 99
+// CHECK3-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK3: cond.true:
+// CHECK3-NEXT: br label [[COND_END:%.*]]
+// CHECK3: cond.false:
+// CHECK3-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK3-NEXT: br label [[COND_END]]
+// CHECK3: cond.end:
+// CHECK3-NEXT: [[COND:%.*]] = phi i32 [ 99, [[COND_TRUE]] ], [ [[TMP4]], [[COND_FALSE]] ]
+// CHECK3-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK3-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK3-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4
+// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK3: omp.inner.for.cond:
+// CHECK3-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK3-NEXT: [[CMP2:%.*]] = icmp slt i32 [[TMP6]], 100
+// CHECK3-NEXT: br i1 [[CMP2]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK3: omp.inner.for.body:
+// CHECK3-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK3-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK3-NEXT: [[TMP9:%.*]] = load i32, i32* [[F_ADDR]], align 4
+// CHECK3-NEXT: store i32 [[TMP9]], i32* [[F_CASTED]], align 4
+// CHECK3-NEXT: [[TMP10:%.*]] = load i32, i32* [[F_CASTED]], align 4
+// CHECK3-NEXT: [[TMP11:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0
+// CHECK3-NEXT: [[TMP12:%.*]] = inttoptr i32 [[TMP7]] to i8*
+// CHECK3-NEXT: store i8* [[TMP12]], i8** [[TMP11]], align 4
+// CHECK3-NEXT: [[TMP13:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1
+// CHECK3-NEXT: [[TMP14:%.*]] = inttoptr i32 [[TMP8]] to i8*
+// CHECK3-NEXT: store i8* [[TMP14]], i8** [[TMP13]], align 4
+// CHECK3-NEXT: [[TMP15:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2
+// CHECK3-NEXT: [[TMP16:%.*]] = bitcast [10 x [10 x i32]]* [[TMP0]] to i8*
+// CHECK3-NEXT: store i8* [[TMP16]], i8** [[TMP15]], align 4
+// CHECK3-NEXT: [[TMP17:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3
+// CHECK3-NEXT: [[TMP18:%.*]] = inttoptr i32 [[TMP10]] to i8*
+// CHECK3-NEXT: store i8* [[TMP18]], i8** [[TMP17]], align 4
+// CHECK3-NEXT: [[TMP19:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK3-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, [10 x [10 x i32]]*, i32)* @__omp_outlined__7 to i8*), i8* null, i8** [[TMP19]], i32 4)
+// CHECK3-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK3: omp.inner.for.inc:
+// CHECK3-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK3-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK3-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP20]], [[TMP21]]
+// CHECK3-NEXT: store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4
+// CHECK3-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK3-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK3-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP22]], [[TMP23]]
+// CHECK3-NEXT: store i32 [[ADD3]], i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK3-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK3-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK3-NEXT: [[ADD4:%.*]] = add nsw i32 [[TMP24]], [[TMP25]]
+// CHECK3-NEXT: store i32 [[ADD4]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK3-NEXT: [[TMP26:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK3-NEXT: [[CMP5:%.*]] = icmp sgt i32 [[TMP26]], 99
+// CHECK3-NEXT: br i1 [[CMP5]], label [[COND_TRUE6:%.*]], label [[COND_FALSE7:%.*]]
+// CHECK3: cond.true6:
+// CHECK3-NEXT: br label [[COND_END8:%.*]]
+// CHECK3: cond.false7:
+// CHECK3-NEXT: [[TMP27:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK3-NEXT: br label [[COND_END8]]
+// CHECK3: cond.end8:
+// CHECK3-NEXT: [[COND9:%.*]] = phi i32 [ 99, [[COND_TRUE6]] ], [ [[TMP27]], [[COND_FALSE7]] ]
+// CHECK3-NEXT: store i32 [[COND9]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK3-NEXT: [[TMP28:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK3-NEXT: store i32 [[TMP28]], i32* [[DOTOMP_IV]], align 4
+// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP17:![0-9]+]]
+// CHECK3: omp.inner.for.end:
+// CHECK3-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK3: omp.loop.exit:
+// CHECK3-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]])
+// CHECK3-NEXT: [[TMP29:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK3-NEXT: [[TMP30:%.*]] = icmp ne i32 [[TMP29]], 0
+// CHECK3-NEXT: br i1 [[TMP30]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]]
+// CHECK3: .omp.final.then:
+// CHECK3-NEXT: store i32 10, i32* [[I]], align 4
+// CHECK3-NEXT: store i32 10, i32* [[J]], align 4
+// CHECK3-NEXT: br label [[DOTOMP_FINAL_DONE]]
+// CHECK3: .omp.final.done:
+// CHECK3-NEXT: ret void
+//
+//
+// CHECK3-LABEL: define {{[^@]+}}@__omp_outlined__7
+// CHECK3-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i32 [[F:%.*]]) #[[ATTR0]] {
+// CHECK3-NEXT: entry:
+// CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK3-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4
+// CHECK3-NEXT: [[F_ADDR:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[_TMP1:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[K:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[J:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK3-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK3-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK3-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK3-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4
+// CHECK3-NEXT: store i32 [[F]], i32* [[F_ADDR]], align 4
+// CHECK3-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4
+// CHECK3-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK3-NEXT: store i32 99, i32* [[DOTOMP_UB]], align 4
+// CHECK3-NEXT: [[TMP1:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK3-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTOMP_LB]], align 4
+// CHECK3-NEXT: store i32 [[TMP2]], i32* [[DOTOMP_UB]], align 4
+// CHECK3-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK3-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK3-NEXT: [[TMP3:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK3-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4
+// CHECK3-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP4]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK3-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK3-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4
+// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK3: omp.inner.for.cond:
+// CHECK3-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK3-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK3-NEXT: [[CMP:%.*]] = icmp ule i32 [[TMP6]], [[TMP7]]
+// CHECK3-NEXT: br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK3: omp.inner.for.body:
+// CHECK3-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK3-NEXT: [[DIV:%.*]] = sdiv i32 [[TMP8]], 10
+// CHECK3-NEXT: [[MUL:%.*]] = mul nsw i32 [[DIV]], 1
+// CHECK3-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK3-NEXT: store i32 [[ADD]], i32* [[I]], align 4
+// CHECK3-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK3-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK3-NEXT: [[DIV2:%.*]] = sdiv i32 [[TMP10]], 10
+// CHECK3-NEXT: [[MUL3:%.*]] = mul nsw i32 [[DIV2]], 10
+// CHECK3-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP9]], [[MUL3]]
+// CHECK3-NEXT: [[MUL4:%.*]] = mul nsw i32 [[SUB]], 1
+// CHECK3-NEXT: [[ADD5:%.*]] = add nsw i32 0, [[MUL4]]
+// CHECK3-NEXT: store i32 [[ADD5]], i32* [[J]], align 4
+// CHECK3-NEXT: store i32 10, i32* [[K]], align 4
+// CHECK3-NEXT: [[TMP11:%.*]] = load i32, i32* [[I]], align 4
+// CHECK3-NEXT: [[TMP12:%.*]] = load i32, i32* [[J]], align 4
+// CHECK3-NEXT: [[TMP13:%.*]] = load i32, i32* [[F_ADDR]], align 4
+// CHECK3-NEXT: [[MUL6:%.*]] = mul nsw i32 [[TMP12]], [[TMP13]]
+// CHECK3-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP11]], [[MUL6]]
+// CHECK3-NEXT: [[TMP14:%.*]] = load i32, i32* [[K]], align 4
+// CHECK3-NEXT: [[ADD8:%.*]] = add nsw i32 [[ADD7]], [[TMP14]]
+// CHECK3-NEXT: [[TMP15:%.*]] = load i32, i32* [[I]], align 4
+// CHECK3-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* [[TMP0]], i32 0, i32 [[TMP15]]
+// CHECK3-NEXT: [[TMP16:%.*]] = load i32, i32* [[J]], align 4
+// CHECK3-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[ARRAYIDX]], i32 0, i32 [[TMP16]]
+// CHECK3-NEXT: store i32 [[ADD8]], i32* [[ARRAYIDX9]], align 4
+// CHECK3-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK3: omp.body.continue:
+// CHECK3-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK3: omp.inner.for.inc:
+// CHECK3-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK3-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK3-NEXT: [[ADD10:%.*]] = add nsw i32 [[TMP17]], [[TMP18]]
+// CHECK3-NEXT: store i32 [[ADD10]], i32* [[DOTOMP_IV]], align 4
+// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP18:![0-9]+]]
+// CHECK3: omp.inner.for.end:
+// CHECK3-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK3: omp.loop.exit:
+// CHECK3-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP4]])
+// CHECK3-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK3-NEXT: [[TMP20:%.*]] = icmp ne i32 [[TMP19]], 0
+// CHECK3-NEXT: br i1 [[TMP20]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]]
+// CHECK3: .omp.final.then:
+// CHECK3-NEXT: store i32 10, i32* [[I]], align 4
+// CHECK3-NEXT: store i32 10, i32* [[J]], align 4
+// CHECK3-NEXT: br label [[DOTOMP_FINAL_DONE]]
+// CHECK3: .omp.final.done:
+// CHECK3-NEXT: ret void
+//
+//
+// CHECK4-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l38
+// CHECK4-SAME: (i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32 [[L:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK4-NEXT: entry:
+// CHECK4-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4
+// CHECK4-NEXT: [[L_ADDR:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[L_CASTED:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK4-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4
+// CHECK4-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4
+// CHECK4-NEXT: store i32 [[L]], i32* [[L_ADDR]], align 4
+// CHECK4-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4
+// CHECK4-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK4-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0)
+// CHECK4-NEXT: br label [[DOTEXECUTE:%.*]]
+// CHECK4: .execute:
+// CHECK4-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3:[0-9]+]])
+// CHECK4-NEXT: [[TMP2:%.*]] = load i32, i32* [[N_ADDR]], align 4
+// CHECK4-NEXT: store i32 [[TMP2]], i32* [[N_CASTED]], align 4
+// CHECK4-NEXT: [[TMP3:%.*]] = load i32, i32* [[N_CASTED]], align 4
+// CHECK4-NEXT: [[TMP4:%.*]] = load i32, i32* [[L_ADDR]], align 4
+// CHECK4-NEXT: store i32 [[TMP4]], i32* [[L_CASTED]], align 4
+// CHECK4-NEXT: [[TMP5:%.*]] = load i32, i32* [[L_CASTED]], align 4
+// CHECK4-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK4-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32 [[TMP3]], [1000 x i32]* [[TMP0]], i32 [[TMP5]]) #[[ATTR2:[0-9]+]]
+// CHECK4-NEXT: br label [[DOTOMP_DEINIT:%.*]]
+// CHECK4: .omp.deinit:
+// CHECK4-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0)
+// CHECK4-NEXT: br label [[DOTEXIT:%.*]]
+// CHECK4: .exit:
+// CHECK4-NEXT: ret void
+//
+//
+// CHECK4-LABEL: define {{[^@]+}}@__omp_outlined__
+// CHECK4-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32 [[L:%.*]]) #[[ATTR0]] {
+// CHECK4-NEXT: entry:
+// CHECK4-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK4-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK4-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4
+// CHECK4-NEXT: [[L_ADDR:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[I4:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[L_CASTED:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [5 x i8*], align 4
+// CHECK4-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK4-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK4-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4
+// CHECK4-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4
+// CHECK4-NEXT: store i32 [[L]], i32* [[L_ADDR]], align 4
+// CHECK4-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4
+// CHECK4-NEXT: [[TMP1:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared", align 2
+// CHECK4-NEXT: [[TMP2:%.*]] = load i32, i32* @"_openmp_static_kernel$size", align 4
+// CHECK4-NEXT: call void @__kmpc_get_team_static_memory(i16 1, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds (%"union._shared_openmp_static_memory_type_$_", %"union._shared_openmp_static_memory_type_$_" addrspace(3)* @"_openmp_shared_static_glob_rd_$_", i32 0, i32 0, i32 0) to i8*), i32 [[TMP2]], i16 [[TMP1]], i8** addrspacecast (i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr" to i8**))
+// CHECK4-NEXT: [[TMP3:%.*]] = load i8*, i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr", align 4
+// CHECK4-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, i8* [[TMP3]], i32 0
+// CHECK4-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to %struct._globalized_locals_ty*
+// CHECK4-NEXT: [[L1:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP5]], i32 0, i32 0
+// CHECK4-NEXT: [[TMP6:%.*]] = load i32, i32* [[N_ADDR]], align 4
+// CHECK4-NEXT: store i32 [[TMP6]], i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK4-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK4-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP7]], 0
+// CHECK4-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK4-NEXT: [[SUB3:%.*]] = sub nsw i32 [[DIV]], 1
+// CHECK4-NEXT: store i32 [[SUB3]], i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK4-NEXT: store i32 0, i32* [[I]], align 4
+// CHECK4-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK4-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP8]]
+// CHECK4-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]]
+// CHECK4: omp.precond.then:
+// CHECK4-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK4-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK4-NEXT: store i32 [[TMP9]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK4-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK4-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK4-NEXT: [[TMP10:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK4-NEXT: [[TMP11:%.*]] = load i32, i32* [[TMP10]], align 4
+// CHECK4-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1:[0-9]+]], i32 [[TMP11]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 128)
+// CHECK4-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK4-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK4-NEXT: [[CMP5:%.*]] = icmp sgt i32 [[TMP12]], [[TMP13]]
+// CHECK4-NEXT: br i1 [[CMP5]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK4: cond.true:
+// CHECK4-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK4-NEXT: br label [[COND_END:%.*]]
+// CHECK4: cond.false:
+// CHECK4-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK4-NEXT: br label [[COND_END]]
+// CHECK4: cond.end:
+// CHECK4-NEXT: [[COND:%.*]] = phi i32 [ [[TMP14]], [[COND_TRUE]] ], [ [[TMP15]], [[COND_FALSE]] ]
+// CHECK4-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK4-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK4-NEXT: store i32 [[TMP16]], i32* [[DOTOMP_IV]], align 4
+// CHECK4-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK4: omp.inner.for.cond:
+// CHECK4-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK4-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK4-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP18]], 1
+// CHECK4-NEXT: [[CMP6:%.*]] = icmp slt i32 [[TMP17]], [[ADD]]
+// CHECK4-NEXT: br i1 [[CMP6]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK4: omp.inner.for.body:
+// CHECK4-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK4-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK4-NEXT: [[TMP21:%.*]] = load i32, i32* [[N_ADDR]], align 4
+// CHECK4-NEXT: store i32 [[TMP21]], i32* [[N_CASTED]], align 4
+// CHECK4-NEXT: [[TMP22:%.*]] = load i32, i32* [[N_CASTED]], align 4
+// CHECK4-NEXT: [[TMP23:%.*]] = load i32, i32* [[L_ADDR]], align 4
+// CHECK4-NEXT: store i32 [[TMP23]], i32* [[L_CASTED]], align 4
+// CHECK4-NEXT: [[TMP24:%.*]] = load i32, i32* [[L_CASTED]], align 4
+// CHECK4-NEXT: [[TMP25:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0
+// CHECK4-NEXT: [[TMP26:%.*]] = inttoptr i32 [[TMP19]] to i8*
+// CHECK4-NEXT: store i8* [[TMP26]], i8** [[TMP25]], align 4
+// CHECK4-NEXT: [[TMP27:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1
+// CHECK4-NEXT: [[TMP28:%.*]] = inttoptr i32 [[TMP20]] to i8*
+// CHECK4-NEXT: store i8* [[TMP28]], i8** [[TMP27]], align 4
+// CHECK4-NEXT: [[TMP29:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2
+// CHECK4-NEXT: [[TMP30:%.*]] = inttoptr i32 [[TMP22]] to i8*
+// CHECK4-NEXT: store i8* [[TMP30]], i8** [[TMP29]], align 4
+// CHECK4-NEXT: [[TMP31:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3
+// CHECK4-NEXT: [[TMP32:%.*]] = bitcast [1000 x i32]* [[TMP0]] to i8*
+// CHECK4-NEXT: store i8* [[TMP32]], i8** [[TMP31]], align 4
+// CHECK4-NEXT: [[TMP33:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 4
+// CHECK4-NEXT: [[TMP34:%.*]] = inttoptr i32 [[TMP24]] to i8*
+// CHECK4-NEXT: store i8* [[TMP34]], i8** [[TMP33]], align 4
+// CHECK4-NEXT: [[TMP35:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK4-NEXT: [[TMP36:%.*]] = load i32, i32* [[TMP35]], align 4
+// CHECK4-NEXT: [[TMP37:%.*]] = bitcast [5 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK4-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP36]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, i32, [1000 x i32]*, i32)* @__omp_outlined__1 to i8*), i8* null, i8** [[TMP37]], i32 5)
+// CHECK4-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK4: omp.inner.for.inc:
+// CHECK4-NEXT: [[TMP38:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK4-NEXT: [[TMP39:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK4-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP38]], [[TMP39]]
+// CHECK4-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_IV]], align 4
+// CHECK4-NEXT: [[TMP40:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK4-NEXT: [[TMP41:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK4-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP40]], [[TMP41]]
+// CHECK4-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK4-NEXT: [[TMP42:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK4-NEXT: [[TMP43:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK4-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP42]], [[TMP43]]
+// CHECK4-NEXT: store i32 [[ADD9]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK4-NEXT: [[TMP44:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK4-NEXT: [[TMP45:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK4-NEXT: [[CMP10:%.*]] = icmp sgt i32 [[TMP44]], [[TMP45]]
+// CHECK4-NEXT: br i1 [[CMP10]], label [[COND_TRUE11:%.*]], label [[COND_FALSE12:%.*]]
+// CHECK4: cond.true11:
+// CHECK4-NEXT: [[TMP46:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK4-NEXT: br label [[COND_END13:%.*]]
+// CHECK4: cond.false12:
+// CHECK4-NEXT: [[TMP47:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK4-NEXT: br label [[COND_END13]]
+// CHECK4: cond.end13:
+// CHECK4-NEXT: [[COND14:%.*]] = phi i32 [ [[TMP46]], [[COND_TRUE11]] ], [ [[TMP47]], [[COND_FALSE12]] ]
+// CHECK4-NEXT: store i32 [[COND14]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK4-NEXT: [[TMP48:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK4-NEXT: store i32 [[TMP48]], i32* [[DOTOMP_IV]], align 4
+// CHECK4-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP10:![0-9]+]]
+// CHECK4: omp.inner.for.end:
+// CHECK4-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK4: omp.loop.exit:
+// CHECK4-NEXT: [[TMP49:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK4-NEXT: [[TMP50:%.*]] = load i32, i32* [[TMP49]], align 4
+// CHECK4-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP50]])
+// CHECK4-NEXT: [[TMP51:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK4-NEXT: [[TMP52:%.*]] = icmp ne i32 [[TMP51]], 0
+// CHECK4-NEXT: br i1 [[TMP52]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]]
+// CHECK4: .omp.final.then:
+// CHECK4-NEXT: [[TMP53:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK4-NEXT: [[SUB15:%.*]] = sub nsw i32 [[TMP53]], 0
+// CHECK4-NEXT: [[DIV16:%.*]] = sdiv i32 [[SUB15]], 1
+// CHECK4-NEXT: [[MUL:%.*]] = mul nsw i32 [[DIV16]], 1
+// CHECK4-NEXT: [[ADD17:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK4-NEXT: store i32 [[ADD17]], i32* [[I4]], align 4
+// CHECK4-NEXT: br label [[DOTOMP_FINAL_DONE]]
+// CHECK4: .omp.final.done:
+// CHECK4-NEXT: [[TMP54:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK4-NEXT: [[TMP55:%.*]] = icmp ne i32 [[TMP54]], 0
+// CHECK4-NEXT: br i1 [[TMP55]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]]
+// CHECK4: .omp.lastprivate.then:
+// CHECK4-NEXT: [[TMP56:%.*]] = load i32, i32* [[L_ADDR]], align 4
+// CHECK4-NEXT: store i32 [[TMP56]], i32* [[L_ADDR]], align 4
+// CHECK4-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]]
+// CHECK4: .omp.lastprivate.done:
+// CHECK4-NEXT: br label [[OMP_PRECOND_END]]
+// CHECK4: omp.precond.end:
+// CHECK4-NEXT: [[TMP57:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared", align 2
+// CHECK4-NEXT: call void @__kmpc_restore_team_static_memory(i16 1, i16 [[TMP57]])
+// CHECK4-NEXT: ret void
+//
+//
+// CHECK4-LABEL: define {{[^@]+}}@__omp_outlined__1
+// CHECK4-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32 [[L:%.*]]) #[[ATTR0]] {
+// CHECK4-NEXT: entry:
+// CHECK4-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK4-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK4-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4
+// CHECK4-NEXT: [[L_ADDR:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[I3:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK4-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK4-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK4-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK4-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4
+// CHECK4-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4
+// CHECK4-NEXT: store i32 [[L]], i32* [[L_ADDR]], align 4
+// CHECK4-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4
+// CHECK4-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4
+// CHECK4-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK4-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK4-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0
+// CHECK4-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK4-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1
+// CHECK4-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK4-NEXT: store i32 0, i32* [[I]], align 4
+// CHECK4-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK4-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]]
+// CHECK4-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]]
+// CHECK4: omp.precond.then:
+// CHECK4-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK4-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK4-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_UB]], align 4
+// CHECK4-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK4-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK4-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_LB]], align 4
+// CHECK4-NEXT: store i32 [[TMP6]], i32* [[DOTOMP_UB]], align 4
+// CHECK4-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK4-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK4-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK4-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4
+// CHECK4-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP8]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 32)
+// CHECK4-NEXT: br label [[OMP_DISPATCH_COND:%.*]]
+// CHECK4: omp.dispatch.cond:
+// CHECK4-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK4-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK4-NEXT: [[CMP4:%.*]] = icmp ugt i32 [[TMP9]], [[TMP10]]
+// CHECK4-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK4: cond.true:
+// CHECK4-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK4-NEXT: br label [[COND_END:%.*]]
+// CHECK4: cond.false:
+// CHECK4-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK4-NEXT: br label [[COND_END]]
+// CHECK4: cond.end:
+// CHECK4-NEXT: [[COND:%.*]] = phi i32 [ [[TMP11]], [[COND_TRUE]] ], [ [[TMP12]], [[COND_FALSE]] ]
+// CHECK4-NEXT: store i32 [[COND]], i32* [[DOTOMP_UB]], align 4
+// CHECK4-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK4-NEXT: store i32 [[TMP13]], i32* [[DOTOMP_IV]], align 4
+// CHECK4-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK4-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK4-NEXT: [[CMP5:%.*]] = icmp sle i32 [[TMP14]], [[TMP15]]
+// CHECK4-NEXT: br i1 [[CMP5]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]]
+// CHECK4: omp.dispatch.body:
+// CHECK4-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK4: omp.inner.for.cond:
+// CHECK4-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK4-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK4-NEXT: [[CMP6:%.*]] = icmp sle i32 [[TMP16]], [[TMP17]]
+// CHECK4-NEXT: br i1 [[CMP6]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK4: omp.inner.for.body:
+// CHECK4-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK4-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP18]], 1
+// CHECK4-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK4-NEXT: store i32 [[ADD]], i32* [[I3]], align 4
+// CHECK4-NEXT: [[TMP19:%.*]] = load i32, i32* [[I3]], align 4
+// CHECK4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1000 x i32], [1000 x i32]* [[TMP0]], i32 0, i32 [[TMP19]]
+// CHECK4-NEXT: store i32 1, i32* [[ARRAYIDX]], align 4
+// CHECK4-NEXT: [[TMP20:%.*]] = load i32, i32* [[I3]], align 4
+// CHECK4-NEXT: store i32 [[TMP20]], i32* [[L_ADDR]], align 4
+// CHECK4-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK4: omp.body.continue:
+// CHECK4-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK4: omp.inner.for.inc:
+// CHECK4-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK4-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP21]], 1
+// CHECK4-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_IV]], align 4
+// CHECK4-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP12:![0-9]+]]
+// CHECK4: omp.inner.for.end:
+// CHECK4-NEXT: br label [[OMP_DISPATCH_INC:%.*]]
+// CHECK4: omp.dispatch.inc:
+// CHECK4-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK4-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK4-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP22]], [[TMP23]]
+// CHECK4-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_LB]], align 4
+// CHECK4-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK4-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK4-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP24]], [[TMP25]]
+// CHECK4-NEXT: store i32 [[ADD9]], i32* [[DOTOMP_UB]], align 4
+// CHECK4-NEXT: br label [[OMP_DISPATCH_COND]]
+// CHECK4: omp.dispatch.end:
+// CHECK4-NEXT: [[TMP26:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK4-NEXT: [[TMP27:%.*]] = load i32, i32* [[TMP26]], align 4
+// CHECK4-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP27]])
+// CHECK4-NEXT: [[TMP28:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK4-NEXT: [[TMP29:%.*]] = icmp ne i32 [[TMP28]], 0
+// CHECK4-NEXT: br i1 [[TMP29]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]]
+// CHECK4: .omp.final.then:
+// CHECK4-NEXT: [[TMP30:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK4-NEXT: [[SUB10:%.*]] = sub nsw i32 [[TMP30]], 0
+// CHECK4-NEXT: [[DIV11:%.*]] = sdiv i32 [[SUB10]], 1
+// CHECK4-NEXT: [[MUL12:%.*]] = mul nsw i32 [[DIV11]], 1
+// CHECK4-NEXT: [[ADD13:%.*]] = add nsw i32 0, [[MUL12]]
+// CHECK4-NEXT: store i32 [[ADD13]], i32* [[I3]], align 4
+// CHECK4-NEXT: br label [[DOTOMP_FINAL_DONE]]
+// CHECK4: .omp.final.done:
+// CHECK4-NEXT: [[TMP31:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK4-NEXT: [[TMP32:%.*]] = icmp ne i32 [[TMP31]], 0
+// CHECK4-NEXT: br i1 [[TMP32]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]]
+// CHECK4: .omp.lastprivate.then:
+// CHECK4-NEXT: [[TMP33:%.*]] = load i32, i32* [[L_ADDR]], align 4
+// CHECK4-NEXT: store i32 [[TMP33]], i32* [[L_ADDR]], align 4
+// CHECK4-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]]
+// CHECK4: .omp.lastprivate.done:
+// CHECK4-NEXT: br label [[OMP_PRECOND_END]]
+// CHECK4: omp.precond.end:
+// CHECK4-NEXT: ret void
+//
+//
+// CHECK4-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l44
+// CHECK4-SAME: (i32 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] {
+// CHECK4-NEXT: entry:
+// CHECK4-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 4
+// CHECK4-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK4-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4
+// CHECK4-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 4
+// CHECK4-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 4
+// CHECK4-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK4-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0)
+// CHECK4-NEXT: br label [[DOTEXECUTE:%.*]]
+// CHECK4: .execute:
+// CHECK4-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]])
+// CHECK4-NEXT: [[TMP2:%.*]] = load i32, i32* [[N_ADDR]], align 4
+// CHECK4-NEXT: store i32 [[TMP2]], i32* [[N_CASTED]], align 4
+// CHECK4-NEXT: [[TMP3:%.*]] = load i32, i32* [[N_CASTED]], align 4
+// CHECK4-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK4-NEXT: call void @__omp_outlined__2(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32 [[TMP3]], [1000 x i16]* [[TMP0]]) #[[ATTR2]]
+// CHECK4-NEXT: br label [[DOTOMP_DEINIT:%.*]]
+// CHECK4: .omp.deinit:
+// CHECK4-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0)
+// CHECK4-NEXT: br label [[DOTEXIT:%.*]]
+// CHECK4: .exit:
+// CHECK4-NEXT: ret void
+//
+//
+// CHECK4-LABEL: define {{[^@]+}}@__omp_outlined__2
+// CHECK4-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] {
+// CHECK4-NEXT: entry:
+// CHECK4-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK4-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK4-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 4
+// CHECK4-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[I3:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x i8*], align 4
+// CHECK4-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK4-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK4-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4
+// CHECK4-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 4
+// CHECK4-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 4
+// CHECK4-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4
+// CHECK4-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK4-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK4-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0
+// CHECK4-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK4-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1
+// CHECK4-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK4-NEXT: store i32 0, i32* [[I]], align 4
+// CHECK4-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK4-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]]
+// CHECK4-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]]
+// CHECK4: omp.precond.then:
+// CHECK4-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK4-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK4-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK4-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK4-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK4-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK4-NEXT: [[TMP5:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK4-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4
+// CHECK4-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP6]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]])
+// CHECK4-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK4-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK4-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP7]], [[TMP8]]
+// CHECK4-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK4: cond.true:
+// CHECK4-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK4-NEXT: br label [[COND_END:%.*]]
+// CHECK4: cond.false:
+// CHECK4-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK4-NEXT: br label [[COND_END]]
+// CHECK4: cond.end:
+// CHECK4-NEXT: [[COND:%.*]] = phi i32 [ [[TMP9]], [[COND_TRUE]] ], [ [[TMP10]], [[COND_FALSE]] ]
+// CHECK4-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK4-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK4-NEXT: store i32 [[TMP11]], i32* [[DOTOMP_IV]], align 4
+// CHECK4-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK4: omp.inner.for.cond:
+// CHECK4-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK4-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK4-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP13]], 1
+// CHECK4-NEXT: [[CMP5:%.*]] = icmp slt i32 [[TMP12]], [[ADD]]
+// CHECK4-NEXT: br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK4: omp.inner.for.body:
+// CHECK4-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK4-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK4-NEXT: [[TMP16:%.*]] = load i32, i32* [[N_ADDR]], align 4
+// CHECK4-NEXT: store i32 [[TMP16]], i32* [[N_CASTED]], align 4
+// CHECK4-NEXT: [[TMP17:%.*]] = load i32, i32* [[N_CASTED]], align 4
+// CHECK4-NEXT: [[TMP18:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0
+// CHECK4-NEXT: [[TMP19:%.*]] = inttoptr i32 [[TMP14]] to i8*
+// CHECK4-NEXT: store i8* [[TMP19]], i8** [[TMP18]], align 4
+// CHECK4-NEXT: [[TMP20:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1
+// CHECK4-NEXT: [[TMP21:%.*]] = inttoptr i32 [[TMP15]] to i8*
+// CHECK4-NEXT: store i8* [[TMP21]], i8** [[TMP20]], align 4
+// CHECK4-NEXT: [[TMP22:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2
+// CHECK4-NEXT: [[TMP23:%.*]] = inttoptr i32 [[TMP17]] to i8*
+// CHECK4-NEXT: store i8* [[TMP23]], i8** [[TMP22]], align 4
+// CHECK4-NEXT: [[TMP24:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3
+// CHECK4-NEXT: [[TMP25:%.*]] = bitcast [1000 x i16]* [[TMP0]] to i8*
+// CHECK4-NEXT: store i8* [[TMP25]], i8** [[TMP24]], align 4
+// CHECK4-NEXT: [[TMP26:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK4-NEXT: [[TMP27:%.*]] = load i32, i32* [[TMP26]], align 4
+// CHECK4-NEXT: [[TMP28:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK4-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP27]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, i32, [1000 x i16]*)* @__omp_outlined__3 to i8*), i8* null, i8** [[TMP28]], i32 4)
+// CHECK4-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK4: omp.inner.for.inc:
+// CHECK4-NEXT: [[TMP29:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK4-NEXT: [[TMP30:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK4-NEXT: [[ADD6:%.*]] = add nsw i32 [[TMP29]], [[TMP30]]
+// CHECK4-NEXT: store i32 [[ADD6]], i32* [[DOTOMP_IV]], align 4
+// CHECK4-NEXT: [[TMP31:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK4-NEXT: [[TMP32:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK4-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP31]], [[TMP32]]
+// CHECK4-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK4-NEXT: [[TMP33:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK4-NEXT: [[TMP34:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK4-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP33]], [[TMP34]]
+// CHECK4-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK4-NEXT: [[TMP35:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK4-NEXT: [[TMP36:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK4-NEXT: [[CMP9:%.*]] = icmp sgt i32 [[TMP35]], [[TMP36]]
+// CHECK4-NEXT: br i1 [[CMP9]], label [[COND_TRUE10:%.*]], label [[COND_FALSE11:%.*]]
+// CHECK4: cond.true10:
+// CHECK4-NEXT: [[TMP37:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK4-NEXT: br label [[COND_END12:%.*]]
+// CHECK4: cond.false11:
+// CHECK4-NEXT: [[TMP38:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK4-NEXT: br label [[COND_END12]]
+// CHECK4: cond.end12:
+// CHECK4-NEXT: [[COND13:%.*]] = phi i32 [ [[TMP37]], [[COND_TRUE10]] ], [ [[TMP38]], [[COND_FALSE11]] ]
+// CHECK4-NEXT: store i32 [[COND13]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK4-NEXT: [[TMP39:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK4-NEXT: store i32 [[TMP39]], i32* [[DOTOMP_IV]], align 4
+// CHECK4-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP13:![0-9]+]]
+// CHECK4: omp.inner.for.end:
+// CHECK4-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK4: omp.loop.exit:
+// CHECK4-NEXT: [[TMP40:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK4-NEXT: [[TMP41:%.*]] = load i32, i32* [[TMP40]], align 4
+// CHECK4-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP41]])
+// CHECK4-NEXT: [[TMP42:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK4-NEXT: [[TMP43:%.*]] = icmp ne i32 [[TMP42]], 0
+// CHECK4-NEXT: br i1 [[TMP43]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]]
+// CHECK4: .omp.final.then:
+// CHECK4-NEXT: [[TMP44:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK4-NEXT: [[SUB14:%.*]] = sub nsw i32 [[TMP44]], 0
+// CHECK4-NEXT: [[DIV15:%.*]] = sdiv i32 [[SUB14]], 1
+// CHECK4-NEXT: [[MUL:%.*]] = mul nsw i32 [[DIV15]], 1
+// CHECK4-NEXT: [[ADD16:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK4-NEXT: store i32 [[ADD16]], i32* [[I3]], align 4
+// CHECK4-NEXT: br label [[DOTOMP_FINAL_DONE]]
+// CHECK4: .omp.final.done:
+// CHECK4-NEXT: br label [[OMP_PRECOND_END]]
+// CHECK4: omp.precond.end:
+// CHECK4-NEXT: ret void
+//
+//
+// CHECK4-LABEL: define {{[^@]+}}@__omp_outlined__3
+// CHECK4-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] {
+// CHECK4-NEXT: entry:
+// CHECK4-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK4-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK4-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 4
+// CHECK4-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[I3:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK4-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK4-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK4-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK4-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4
+// CHECK4-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 4
+// CHECK4-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 4
+// CHECK4-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4
+// CHECK4-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK4-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK4-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0
+// CHECK4-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK4-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1
+// CHECK4-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK4-NEXT: store i32 0, i32* [[I]], align 4
+// CHECK4-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK4-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]]
+// CHECK4-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]]
+// CHECK4: omp.precond.then:
+// CHECK4-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK4-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK4-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_UB]], align 4
+// CHECK4-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK4-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK4-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_LB]], align 4
+// CHECK4-NEXT: store i32 [[TMP6]], i32* [[DOTOMP_UB]], align 4
+// CHECK4-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK4-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK4-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK4-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4
+// CHECK4-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP8]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK4-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK4-NEXT: store i32 [[TMP9]], i32* [[DOTOMP_IV]], align 4
+// CHECK4-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK4: omp.inner.for.cond:
+// CHECK4-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK4-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK4-NEXT: [[CMP4:%.*]] = icmp ule i32 [[TMP10]], [[TMP11]]
+// CHECK4-NEXT: br i1 [[CMP4]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK4: omp.inner.for.body:
+// CHECK4-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK4-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP12]], 1
+// CHECK4-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK4-NEXT: store i32 [[ADD]], i32* [[I3]], align 4
+// CHECK4-NEXT: [[TMP13:%.*]] = load i32, i32* [[I3]], align 4
+// CHECK4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1000 x i16], [1000 x i16]* [[TMP0]], i32 0, i32 [[TMP13]]
+// CHECK4-NEXT: [[TMP14:%.*]] = load i16, i16* [[ARRAYIDX]], align 2
+// CHECK4-NEXT: [[CONV:%.*]] = sext i16 [[TMP14]] to i32
+// CHECK4-NEXT: [[ADD5:%.*]] = add nsw i32 [[CONV]], 1
+// CHECK4-NEXT: [[CONV6:%.*]] = trunc i32 [[ADD5]] to i16
+// CHECK4-NEXT: store i16 [[CONV6]], i16* [[ARRAYIDX]], align 2
+// CHECK4-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK4: omp.body.continue:
+// CHECK4-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK4: omp.inner.for.inc:
+// CHECK4-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK4-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK4-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP15]], [[TMP16]]
+// CHECK4-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_IV]], align 4
+// CHECK4-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP14:![0-9]+]]
+// CHECK4: omp.inner.for.end:
+// CHECK4-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK4: omp.loop.exit:
+// CHECK4-NEXT: [[TMP17:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK4-NEXT: [[TMP18:%.*]] = load i32, i32* [[TMP17]], align 4
+// CHECK4-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP18]])
+// CHECK4-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK4-NEXT: [[TMP20:%.*]] = icmp ne i32 [[TMP19]], 0
+// CHECK4-NEXT: br i1 [[TMP20]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]]
+// CHECK4: .omp.final.then:
+// CHECK4-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK4-NEXT: [[SUB8:%.*]] = sub nsw i32 [[TMP21]], 0
+// CHECK4-NEXT: [[DIV9:%.*]] = sdiv i32 [[SUB8]], 1
+// CHECK4-NEXT: [[MUL10:%.*]] = mul nsw i32 [[DIV9]], 1
+// CHECK4-NEXT: [[ADD11:%.*]] = add nsw i32 0, [[MUL10]]
+// CHECK4-NEXT: store i32 [[ADD11]], i32* [[I3]], align 4
+// CHECK4-NEXT: br label [[DOTOMP_FINAL_DONE]]
+// CHECK4: .omp.final.done:
+// CHECK4-NEXT: br label [[OMP_PRECOND_END]]
+// CHECK4: omp.precond.end:
+// CHECK4-NEXT: ret void
+//
+//
+// CHECK4-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l49
+// CHECK4-SAME: ([10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] {
+// CHECK4-NEXT: entry:
+// CHECK4-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4
+// CHECK4-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK4-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4
+// CHECK4-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4
+// CHECK4-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK4-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0)
[[NVPTX_NUM_THREADS]], i16 0) +// CHECK4-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK4: .execute: +// CHECK4-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) +// CHECK4-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK4-NEXT: call void @__omp_outlined__4(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x i32]* [[TMP0]]) #[[ATTR2]] +// CHECK4-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK4: .omp.deinit: +// CHECK4-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) +// CHECK4-NEXT: br label [[DOTEXIT:%.*]] +// CHECK4: .exit: +// CHECK4-NEXT: ret void +// +// +// CHECK4-LABEL: define {{[^@]+}}@__omp_outlined__4 +// CHECK4-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { +// CHECK4-NEXT: entry: +// CHECK4-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK4-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK4-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 +// CHECK4-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [3 x i8*], align 4 +// CHECK4-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK4-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK4-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 +// CHECK4-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 +// CHECK4-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK4-NEXT: store i32 9, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK4-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK4-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK4-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK4-NEXT: [[TMP1:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK4-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4 +// CHECK4-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) +// CHECK4-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK4-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP3]], 9 +// CHECK4-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK4: cond.true: +// CHECK4-NEXT: br label [[COND_END:%.*]] +// CHECK4: cond.false: +// CHECK4-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK4-NEXT: br label [[COND_END]] +// CHECK4: cond.end: +// CHECK4-NEXT: [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP4]], [[COND_FALSE]] ] +// CHECK4-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK4-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK4-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 +// CHECK4-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK4: omp.inner.for.cond: +// CHECK4-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK4-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP6]], 10 +// 
+// CHECK4-NEXT: br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK4: omp.inner.for.body:
+// CHECK4-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK4-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK4-NEXT: [[TMP9:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0
+// CHECK4-NEXT: [[TMP10:%.*]] = inttoptr i32 [[TMP7]] to i8*
+// CHECK4-NEXT: store i8* [[TMP10]], i8** [[TMP9]], align 4
+// CHECK4-NEXT: [[TMP11:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1
+// CHECK4-NEXT: [[TMP12:%.*]] = inttoptr i32 [[TMP8]] to i8*
+// CHECK4-NEXT: store i8* [[TMP12]], i8** [[TMP11]], align 4
+// CHECK4-NEXT: [[TMP13:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2
+// CHECK4-NEXT: [[TMP14:%.*]] = bitcast [10 x i32]* [[TMP0]] to i8*
+// CHECK4-NEXT: store i8* [[TMP14]], i8** [[TMP13]], align 4
+// CHECK4-NEXT: [[TMP15:%.*]] = bitcast [3 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK4-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, [10 x i32]*)* @__omp_outlined__5 to i8*), i8* null, i8** [[TMP15]], i32 3)
+// CHECK4-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK4: omp.inner.for.inc:
+// CHECK4-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK4-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK4-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP16]], [[TMP17]]
+// CHECK4-NEXT: store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4
+// CHECK4-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK4-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK4-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP18]], [[TMP19]]
+// CHECK4-NEXT: store i32 [[ADD2]], i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK4-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK4-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK4-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP20]], [[TMP21]]
+// CHECK4-NEXT: store i32 [[ADD3]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK4-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK4-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP22]], 9
+// CHECK4-NEXT: br i1 [[CMP4]], label [[COND_TRUE5:%.*]], label [[COND_FALSE6:%.*]]
+// CHECK4: cond.true5:
+// CHECK4-NEXT: br label [[COND_END7:%.*]]
+// CHECK4: cond.false6:
+// CHECK4-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK4-NEXT: br label [[COND_END7]]
+// CHECK4: cond.end7:
+// CHECK4-NEXT: [[COND8:%.*]] = phi i32 [ 9, [[COND_TRUE5]] ], [ [[TMP23]], [[COND_FALSE6]] ]
+// CHECK4-NEXT: store i32 [[COND8]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK4-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK4-NEXT: store i32 [[TMP24]], i32* [[DOTOMP_IV]], align 4
+// CHECK4-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP15:![0-9]+]]
+// CHECK4: omp.inner.for.end:
+// CHECK4-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK4: omp.loop.exit:
+// CHECK4-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]])
+// CHECK4-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK4-NEXT: [[TMP26:%.*]] = icmp ne i32 [[TMP25]], 0
+// CHECK4-NEXT: br i1 [[TMP26]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]]
+// CHECK4: .omp.final.then:
+// CHECK4-NEXT: store i32 10, i32* [[I]], align 4
+// CHECK4-NEXT: br label [[DOTOMP_FINAL_DONE]]
+// CHECK4: .omp.final.done:
+// CHECK4-NEXT: ret void
+//
+//
+// CHECK4-LABEL: define {{[^@]+}}@__omp_outlined__5
+// CHECK4-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] {
+// CHECK4-NEXT: entry:
+// CHECK4-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK4-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK4-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4
+// CHECK4-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK4-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK4-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK4-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK4-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4
+// CHECK4-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4
+// CHECK4-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK4-NEXT: store i32 9, i32* [[DOTOMP_UB]], align 4
+// CHECK4-NEXT: [[TMP1:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK4-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK4-NEXT: store i32 [[TMP1]], i32* [[DOTOMP_LB]], align 4
+// CHECK4-NEXT: store i32 [[TMP2]], i32* [[DOTOMP_UB]], align 4
+// CHECK4-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK4-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK4-NEXT: [[TMP3:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK4-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4
+// CHECK4-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP4]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK4-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK4-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4
+// CHECK4-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK4: omp.inner.for.cond:
+// CHECK4-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK4-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK4-NEXT: [[CMP:%.*]] = icmp ule i32 [[TMP6]], [[TMP7]]
+// CHECK4-NEXT: br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK4: omp.inner.for.body:
+// CHECK4-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK4-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP8]], 1
+// CHECK4-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK4-NEXT: store i32 [[ADD]], i32* [[I]], align 4
+// CHECK4-NEXT: [[TMP9:%.*]] = load i32, i32* [[I]], align 4
+// CHECK4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP0]], i32 0, i32 [[TMP9]]
+// CHECK4-NEXT: [[TMP10:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+// CHECK4-NEXT: [[ADD1:%.*]] = add nsw i32 [[TMP10]], 1
+// CHECK4-NEXT: store i32 [[ADD1]], i32* [[ARRAYIDX]], align 4
+// CHECK4-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK4: omp.body.continue:
+// CHECK4-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK4: omp.inner.for.inc:
+// CHECK4-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK4-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK4-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP11]], [[TMP12]]
+// CHECK4-NEXT: store i32 [[ADD2]], i32* [[DOTOMP_IV]], align 4
+// CHECK4-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP16:![0-9]+]]
+// CHECK4: omp.inner.for.end:
+// CHECK4-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK4: omp.loop.exit:
+// CHECK4-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP4]])
+// CHECK4-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK4-NEXT: [[TMP14:%.*]] = icmp ne i32 [[TMP13]], 0
+// CHECK4-NEXT: br i1 [[TMP14]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]]
+// CHECK4: .omp.final.then:
+// CHECK4-NEXT: store i32 10, i32* [[I]], align 4
+// CHECK4-NEXT: br label [[DOTOMP_FINAL_DONE]]
+// CHECK4: .omp.final.done:
+// CHECK4-NEXT: ret void
+//
+//
+// CHECK4-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l54
+// CHECK4-SAME: ([10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i32 [[F:%.*]]) #[[ATTR0]] {
+// CHECK4-NEXT: entry:
+// CHECK4-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4
+// CHECK4-NEXT: [[F_ADDR:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[F_CASTED:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK4-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4
+// CHECK4-NEXT: store i32 [[F]], i32* [[F_ADDR]], align 4
+// CHECK4-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4
+// CHECK4-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK4-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0)
+// CHECK4-NEXT: br label [[DOTEXECUTE:%.*]]
+// CHECK4: .execute:
+// CHECK4-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]])
+// CHECK4-NEXT: [[TMP2:%.*]] = load i32, i32* [[F_ADDR]], align 4
+// CHECK4-NEXT: store i32 [[TMP2]], i32* [[F_CASTED]], align 4
+// CHECK4-NEXT: [[TMP3:%.*]] = load i32, i32* [[F_CASTED]], align 4
+// CHECK4-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK4-NEXT: call void @__omp_outlined__6(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x [10 x i32]]* [[TMP0]], i32 [[TMP3]]) #[[ATTR2]]
+// CHECK4-NEXT: br label [[DOTOMP_DEINIT:%.*]]
+// CHECK4: .omp.deinit:
+// CHECK4-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0)
+// CHECK4-NEXT: br label [[DOTEXIT:%.*]]
+// CHECK4: .exit:
+// CHECK4-NEXT: ret void
+//
+//
+// CHECK4-LABEL: define {{[^@]+}}@__omp_outlined__6
+// CHECK4-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i32 [[F:%.*]]) #[[ATTR0]] {
+// CHECK4-NEXT: entry:
+// CHECK4-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK4-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK4-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4
+// CHECK4-NEXT: [[F_ADDR:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[_TMP1:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[K:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[J:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[F_CASTED:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x i8*], align 4
+// CHECK4-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK4-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK4-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4
+// CHECK4-NEXT: store i32 [[F]], i32* [[F_ADDR]], align 4
+// CHECK4-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4
+// CHECK4-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK4-NEXT: store i32 99, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK4-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK4-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK4-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK4-NEXT: [[TMP1:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK4-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4
+// CHECK4-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]])
+// CHECK4-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK4-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP3]], 99
+// CHECK4-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK4: cond.true:
+// CHECK4-NEXT: br label [[COND_END:%.*]]
+// CHECK4: cond.false:
+// CHECK4-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK4-NEXT: br label [[COND_END]]
+// CHECK4: cond.end:
+// CHECK4-NEXT: [[COND:%.*]] = phi i32 [ 99, [[COND_TRUE]] ], [ [[TMP4]], [[COND_FALSE]] ]
+// CHECK4-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK4-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK4-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4
+// CHECK4-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK4: omp.inner.for.cond:
+// CHECK4-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK4-NEXT: [[CMP2:%.*]] = icmp slt i32 [[TMP6]], 100
+// CHECK4-NEXT: br i1 [[CMP2]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK4: omp.inner.for.body:
+// CHECK4-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK4-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK4-NEXT: [[TMP9:%.*]] = load i32, i32* [[F_ADDR]], align 4
+// CHECK4-NEXT: store i32 [[TMP9]], i32* [[F_CASTED]], align 4
+// CHECK4-NEXT: [[TMP10:%.*]] = load i32, i32* [[F_CASTED]], align 4
+// CHECK4-NEXT: [[TMP11:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0
+// CHECK4-NEXT: [[TMP12:%.*]] = inttoptr i32 [[TMP7]] to i8*
+// CHECK4-NEXT: store i8* [[TMP12]], i8** [[TMP11]], align 4
+// CHECK4-NEXT: [[TMP13:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1
+// CHECK4-NEXT: [[TMP14:%.*]] = inttoptr i32 [[TMP8]] to i8*
+// CHECK4-NEXT: store i8* [[TMP14]], i8** [[TMP13]], align 4
+// CHECK4-NEXT: [[TMP15:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2
+// CHECK4-NEXT: [[TMP16:%.*]] = bitcast [10 x [10 x i32]]* [[TMP0]] to i8*
+// CHECK4-NEXT: store i8* [[TMP16]], i8** [[TMP15]], align 4
+// CHECK4-NEXT: [[TMP17:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3
+// CHECK4-NEXT: [[TMP18:%.*]] = inttoptr i32 [[TMP10]] to i8*
+// CHECK4-NEXT: store i8* [[TMP18]], i8** [[TMP17]], align 4
+// CHECK4-NEXT: [[TMP19:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK4-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, [10 x [10 x i32]]*, i32)* @__omp_outlined__7 to i8*), i8* null, i8** [[TMP19]], i32 4)
+// CHECK4-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK4: omp.inner.for.inc:
+// CHECK4-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK4-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK4-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP20]], [[TMP21]]
+// CHECK4-NEXT: store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4
+// CHECK4-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK4-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK4-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP22]], [[TMP23]]
+// CHECK4-NEXT: store i32 [[ADD3]], i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK4-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK4-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK4-NEXT: [[ADD4:%.*]] = add nsw i32 [[TMP24]], [[TMP25]]
+// CHECK4-NEXT: store i32 [[ADD4]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK4-NEXT: [[TMP26:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK4-NEXT: [[CMP5:%.*]] = icmp sgt i32 [[TMP26]], 99
+// CHECK4-NEXT: br i1 [[CMP5]], label [[COND_TRUE6:%.*]], label [[COND_FALSE7:%.*]]
+// CHECK4: cond.true6:
+// CHECK4-NEXT: br label [[COND_END8:%.*]]
+// CHECK4: cond.false7:
+// CHECK4-NEXT: [[TMP27:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK4-NEXT: br label [[COND_END8]]
+// CHECK4: cond.end8:
+// CHECK4-NEXT: [[COND9:%.*]] = phi i32 [ 99, [[COND_TRUE6]] ], [ [[TMP27]], [[COND_FALSE7]] ]
+// CHECK4-NEXT: store i32 [[COND9]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK4-NEXT: [[TMP28:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK4-NEXT: store i32 [[TMP28]], i32* [[DOTOMP_IV]], align 4
+// CHECK4-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP17:![0-9]+]]
+// CHECK4: omp.inner.for.end:
+// CHECK4-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK4: omp.loop.exit:
+// CHECK4-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]])
+// CHECK4-NEXT: [[TMP29:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK4-NEXT: [[TMP30:%.*]] = icmp ne i32 [[TMP29]], 0
+// CHECK4-NEXT: br i1 [[TMP30]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]]
+// CHECK4: .omp.final.then:
+// CHECK4-NEXT: store i32 10, i32* [[I]], align 4
+// CHECK4-NEXT: store i32 10, i32* [[J]], align 4
+// CHECK4-NEXT: br label [[DOTOMP_FINAL_DONE]]
+// CHECK4: .omp.final.done:
+// CHECK4-NEXT: ret void
+//
+//
+// CHECK4-LABEL: define {{[^@]+}}@__omp_outlined__7
+// CHECK4-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i32 [[F:%.*]]) #[[ATTR0]] {
+// CHECK4-NEXT: entry:
+// CHECK4-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK4-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK4-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4
+// CHECK4-NEXT: [[F_ADDR:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[_TMP1:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[K:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[J:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK4-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK4-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK4-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK4-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4
+// CHECK4-NEXT: store i32 [[F]], i32* [[F_ADDR]], align 4
+// CHECK4-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4
+// CHECK4-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK4-NEXT: store i32 99, i32* [[DOTOMP_UB]], align 4
+// CHECK4-NEXT: [[TMP1:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK4-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK4-NEXT: store i32 [[TMP1]], i32* [[DOTOMP_LB]], align 4
+// CHECK4-NEXT: store i32 [[TMP2]], i32* [[DOTOMP_UB]], align 4
+// CHECK4-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK4-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK4-NEXT: [[TMP3:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK4-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4
+// CHECK4-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP4]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK4-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK4-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4
+// CHECK4-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK4: omp.inner.for.cond:
+// CHECK4-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK4-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK4-NEXT: [[CMP:%.*]] = icmp ule i32 [[TMP6]], [[TMP7]]
+// CHECK4-NEXT: br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK4: omp.inner.for.body:
+// CHECK4-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK4-NEXT: [[DIV:%.*]] = sdiv i32 [[TMP8]], 10
+// CHECK4-NEXT: [[MUL:%.*]] = mul nsw i32 [[DIV]], 1
+// CHECK4-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK4-NEXT: store i32 [[ADD]], i32* [[I]], align 4
+// CHECK4-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK4-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK4-NEXT: [[DIV2:%.*]] = sdiv i32 [[TMP10]], 10
+// CHECK4-NEXT: [[MUL3:%.*]] = mul nsw i32 [[DIV2]], 10
+// CHECK4-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP9]], [[MUL3]]
+// CHECK4-NEXT: [[MUL4:%.*]] = mul nsw i32 [[SUB]], 1
+// CHECK4-NEXT: [[ADD5:%.*]] = add nsw i32 0, [[MUL4]]
+// CHECK4-NEXT: store i32 [[ADD5]], i32* [[J]], align 4
+// CHECK4-NEXT: store i32 10, i32* [[K]], align 4
+// CHECK4-NEXT: [[TMP11:%.*]] = load i32, i32* [[I]], align 4
+// CHECK4-NEXT: [[TMP12:%.*]] = load i32, i32* [[J]], align 4
+// CHECK4-NEXT: [[TMP13:%.*]] = load i32, i32* [[F_ADDR]], align 4
+// CHECK4-NEXT: [[MUL6:%.*]] = mul nsw i32 [[TMP12]], [[TMP13]]
+// CHECK4-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP11]], [[MUL6]]
+// CHECK4-NEXT: [[TMP14:%.*]] = load i32, i32* [[K]], align 4
+// CHECK4-NEXT: [[ADD8:%.*]] = add nsw i32 [[ADD7]], [[TMP14]]
+// CHECK4-NEXT: [[TMP15:%.*]] = load i32, i32* [[I]], align 4
+// CHECK4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* [[TMP0]], i32 0, i32 [[TMP15]]
+// CHECK4-NEXT: [[TMP16:%.*]] = load i32, i32* [[J]], align 4
+// CHECK4-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[ARRAYIDX]], i32 0, i32 [[TMP16]]
+// CHECK4-NEXT: store i32 [[ADD8]], i32* [[ARRAYIDX9]], align 4
+// CHECK4-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK4: omp.body.continue:
+// CHECK4-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK4: omp.inner.for.inc:
+// CHECK4-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK4-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK4-NEXT: [[ADD10:%.*]] = add nsw i32 [[TMP17]], [[TMP18]]
+// CHECK4-NEXT: store i32 [[ADD10]], i32* [[DOTOMP_IV]], align 4
+// CHECK4-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP18:![0-9]+]]
+// CHECK4: omp.inner.for.end:
+// CHECK4-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK4: omp.loop.exit:
+// CHECK4-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP4]])
+// CHECK4-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK4-NEXT: [[TMP20:%.*]] = icmp ne i32 [[TMP19]], 0
+// CHECK4-NEXT: br i1 [[TMP20]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]]
+// CHECK4: .omp.final.then:
+// CHECK4-NEXT: store i32 10, i32* [[I]], align 4
+// CHECK4-NEXT: store i32 10, i32* [[J]], align 4
+// CHECK4-NEXT: br label [[DOTOMP_FINAL_DONE]]
+// CHECK4: .omp.final.done:
+// CHECK4-NEXT: ret void
+//
+//
+// CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l38
+// CHECK5-SAME: (i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32 [[L:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK5-NEXT: entry:
+// CHECK5-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4
+// CHECK5-NEXT: [[L_ADDR:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[L_CASTED:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK5-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4
+// CHECK5-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4
+// CHECK5-NEXT: store i32 [[L]], i32* [[L_ADDR]], align 4
+// CHECK5-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4
+// CHECK5-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK5-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0)
+// CHECK5-NEXT: br label [[DOTEXECUTE:%.*]]
+// CHECK5: .execute:
+// CHECK5-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3:[0-9]+]])
+// CHECK5-NEXT: [[TMP2:%.*]] = load i32, i32* [[N_ADDR]], align 4
+// CHECK5-NEXT: store i32 [[TMP2]], i32* [[N_CASTED]], align 4
+// CHECK5-NEXT: [[TMP3:%.*]] = load i32, i32* [[N_CASTED]], align 4
+// CHECK5-NEXT: [[TMP4:%.*]] = load i32, i32* [[L_ADDR]], align 4
+// CHECK5-NEXT: store i32 [[TMP4]], i32* [[L_CASTED]], align 4
+// CHECK5-NEXT: [[TMP5:%.*]] = load i32, i32* [[L_CASTED]], align 4
+// CHECK5-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK5-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32 [[TMP3]], [1000 x i32]* [[TMP0]], i32 [[TMP5]]) #[[ATTR2:[0-9]+]]
+// CHECK5-NEXT: br label [[DOTOMP_DEINIT:%.*]]
+// CHECK5: .omp.deinit:
+// CHECK5-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0)
+// CHECK5-NEXT: br label [[DOTEXIT:%.*]]
+// CHECK5: .exit:
+// CHECK5-NEXT: ret void
+//
+//
+// CHECK5-LABEL: define {{[^@]+}}@__omp_outlined__
+// CHECK5-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32 [[L:%.*]]) #[[ATTR0]] {
+// CHECK5-NEXT: entry:
+// CHECK5-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK5-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK5-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4
+// CHECK5-NEXT: [[L_ADDR:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[I4:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[L_CASTED:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [5 x i8*], align 4
+// CHECK5-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK5-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK5-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4
+// CHECK5-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4
+// CHECK5-NEXT: store i32 [[L]], i32* [[L_ADDR]], align 4
+// CHECK5-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4
+// CHECK5-NEXT: [[TMP1:%.*]] = call i8* @__kmpc_data_sharing_push_stack(i32 4, i16 1)
+// CHECK5-NEXT: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to %struct._globalized_locals_ty*
+// CHECK5-NEXT: [[L1:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP2]], i32 0, i32 0
+// CHECK5-NEXT: [[TMP3:%.*]] = load i32, i32* [[N_ADDR]], align 4
+// CHECK5-NEXT: store i32 [[TMP3]], i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK5-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK5-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP4]], 0
+// CHECK5-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK5-NEXT: [[SUB3:%.*]] = sub nsw i32 [[DIV]], 1
+// CHECK5-NEXT: store i32 [[SUB3]], i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK5-NEXT: store i32 0, i32* [[I]], align 4
+// CHECK5-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK5-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP5]]
+// CHECK5-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]]
+// CHECK5: omp.precond.then:
+// CHECK5-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK5-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK5-NEXT: store i32 [[TMP6]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK5-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK5-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK5-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK5-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4
+// CHECK5-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1:[0-9]+]], i32 [[TMP8]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 128)
+// CHECK5-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK5-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK5-NEXT: [[CMP5:%.*]] = icmp sgt i32 [[TMP9]], [[TMP10]]
+// CHECK5-NEXT: br i1 [[CMP5]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK5: cond.true:
+// CHECK5-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK5-NEXT: br label [[COND_END:%.*]]
+// CHECK5: cond.false:
+// CHECK5-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK5-NEXT: br label [[COND_END]]
+// CHECK5: cond.end:
+// CHECK5-NEXT: [[COND:%.*]] = phi i32 [ [[TMP11]], [[COND_TRUE]] ], [ [[TMP12]], [[COND_FALSE]] ]
+// CHECK5-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK5-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK5-NEXT: store i32 [[TMP13]], i32* [[DOTOMP_IV]], align 4
+// CHECK5-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK5: omp.inner.for.cond:
+// CHECK5-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK5-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK5-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP15]], 1
+// CHECK5-NEXT: [[CMP6:%.*]] = icmp slt i32 [[TMP14]], [[ADD]]
+// CHECK5-NEXT: br i1 [[CMP6]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK5: omp.inner.for.body:
+// CHECK5-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK5-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK5-NEXT: [[TMP18:%.*]] = load i32, i32* [[N_ADDR]], align 4
+// CHECK5-NEXT: store i32 [[TMP18]], i32* [[N_CASTED]], align 4
+// CHECK5-NEXT: [[TMP19:%.*]] = load i32, i32* [[N_CASTED]], align 4
+// CHECK5-NEXT: [[TMP20:%.*]] = load i32, i32* [[L_ADDR]], align 4
+// CHECK5-NEXT: store i32 [[TMP20]], i32* [[L_CASTED]], align 4
+// CHECK5-NEXT: [[TMP21:%.*]] = load i32, i32* [[L_CASTED]], align 4
+// CHECK5-NEXT: [[TMP22:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0
+// CHECK5-NEXT: [[TMP23:%.*]] = inttoptr i32 [[TMP16]] to i8*
+// CHECK5-NEXT: store i8* [[TMP23]], i8** [[TMP22]], align 4
+// CHECK5-NEXT: [[TMP24:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1
+// CHECK5-NEXT: [[TMP25:%.*]] = inttoptr i32 [[TMP17]] to i8*
+// CHECK5-NEXT: store i8* [[TMP25]], i8** [[TMP24]], align 4
+// CHECK5-NEXT: [[TMP26:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2
+// CHECK5-NEXT: [[TMP27:%.*]] = inttoptr i32 [[TMP19]] to i8*
+// CHECK5-NEXT: store i8* [[TMP27]], i8** [[TMP26]], align 4
+// CHECK5-NEXT: [[TMP28:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3
+// CHECK5-NEXT: [[TMP29:%.*]] = bitcast [1000 x i32]* [[TMP0]] to i8*
+// CHECK5-NEXT: store i8* [[TMP29]], i8** [[TMP28]], align 4
+// CHECK5-NEXT: [[TMP30:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 4
+// CHECK5-NEXT: [[TMP31:%.*]] = inttoptr i32 [[TMP21]] to i8*
+// CHECK5-NEXT: store i8* [[TMP31]], i8** [[TMP30]], align 4
+// CHECK5-NEXT: [[TMP32:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK5-NEXT: [[TMP33:%.*]] = load i32, i32* [[TMP32]], align 4
+// CHECK5-NEXT: [[TMP34:%.*]] = bitcast [5 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK5-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP33]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, i32, [1000 x i32]*, i32)* @__omp_outlined__1 to i8*), i8* null, i8** [[TMP34]], i32 5)
+// CHECK5-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK5: omp.inner.for.inc:
+// CHECK5-NEXT: [[TMP35:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK5-NEXT: [[TMP36:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK5-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP35]], [[TMP36]]
+// CHECK5-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_IV]], align 4
+// CHECK5-NEXT: [[TMP37:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK5-NEXT: [[TMP38:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK5-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP37]], [[TMP38]]
+// CHECK5-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK5-NEXT: [[TMP39:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK5-NEXT: [[TMP40:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK5-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP39]], [[TMP40]]
+// CHECK5-NEXT: store i32 [[ADD9]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK5-NEXT: [[TMP41:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK5-NEXT: [[TMP42:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK5-NEXT: [[CMP10:%.*]] = icmp sgt i32 [[TMP41]], [[TMP42]]
+// CHECK5-NEXT: br i1 [[CMP10]], label [[COND_TRUE11:%.*]], label [[COND_FALSE12:%.*]]
+// CHECK5: cond.true11:
+// CHECK5-NEXT: [[TMP43:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK5-NEXT: br label [[COND_END13:%.*]]
+// CHECK5: cond.false12:
+// CHECK5-NEXT: [[TMP44:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK5-NEXT: br label [[COND_END13]]
+// CHECK5: cond.end13:
+// CHECK5-NEXT: [[COND14:%.*]] = phi i32 [ [[TMP43]], [[COND_TRUE11]] ], [ [[TMP44]], [[COND_FALSE12]] ]
+// CHECK5-NEXT: store i32 [[COND14]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK5-NEXT: [[TMP45:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK5-NEXT: store i32 [[TMP45]], i32* [[DOTOMP_IV]], align 4
+// CHECK5-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP10:![0-9]+]]
+// CHECK5: omp.inner.for.end:
+// CHECK5-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK5: omp.loop.exit:
+// CHECK5-NEXT: [[TMP46:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK5-NEXT: [[TMP47:%.*]] = load i32, i32* [[TMP46]], align 4
+// CHECK5-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP47]])
+// CHECK5-NEXT: [[TMP48:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK5-NEXT: [[TMP49:%.*]] = icmp ne i32 [[TMP48]], 0
+// CHECK5-NEXT: br i1 [[TMP49]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]]
+// CHECK5: .omp.final.then:
+// CHECK5-NEXT: [[TMP50:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK5-NEXT: [[SUB15:%.*]] = sub nsw i32 [[TMP50]], 0
+// CHECK5-NEXT: [[DIV16:%.*]] = sdiv i32 [[SUB15]], 1
+// CHECK5-NEXT: [[MUL:%.*]] = mul nsw i32 [[DIV16]], 1
+// CHECK5-NEXT: [[ADD17:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK5-NEXT: store i32 [[ADD17]], i32* [[I4]], align 4
+// CHECK5-NEXT: br label [[DOTOMP_FINAL_DONE]]
+// CHECK5: .omp.final.done:
+// CHECK5-NEXT: [[TMP51:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK5-NEXT: [[TMP52:%.*]] = icmp ne i32 [[TMP51]], 0
+// CHECK5-NEXT: br i1 [[TMP52]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]]
+// CHECK5: .omp.lastprivate.then:
+// CHECK5-NEXT: [[TMP53:%.*]] = load i32, i32* [[L_ADDR]], align 4
+// CHECK5-NEXT: store i32 [[TMP53]], i32* [[L_ADDR]], align 4
+// CHECK5-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]]
+// CHECK5: .omp.lastprivate.done:
+// CHECK5-NEXT: br label [[OMP_PRECOND_END]]
+// CHECK5: omp.precond.end:
+// CHECK5-NEXT: call void @__kmpc_data_sharing_pop_stack(i8* [[TMP1]])
+// CHECK5-NEXT: ret void
+//
+//
+// CHECK5-LABEL: define {{[^@]+}}@__omp_outlined__1
+// CHECK5-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32 [[L:%.*]]) #[[ATTR0]] {
+// CHECK5-NEXT: entry:
+// CHECK5-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK5-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK5-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4
+// CHECK5-NEXT: [[L_ADDR:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[I3:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK5-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK5-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK5-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK5-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4
+// CHECK5-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4
+// CHECK5-NEXT: store i32 [[L]], i32* [[L_ADDR]], align 4
+// CHECK5-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4
+// CHECK5-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4
+// CHECK5-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK5-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK5-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0
+// CHECK5-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK5-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1
+// CHECK5-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK5-NEXT: store i32 0, i32* [[I]], align 4
+// CHECK5-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK5-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]]
+// CHECK5-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]]
+// CHECK5: omp.precond.then:
+// CHECK5-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK5-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK5-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_UB]], align 4
+// CHECK5-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK5-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK5-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_LB]], align 4
+// CHECK5-NEXT: store i32 [[TMP6]], i32* [[DOTOMP_UB]], align 4
+// CHECK5-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK5-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK5-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK5-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4
+// CHECK5-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP8]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 32)
+// CHECK5-NEXT: br label [[OMP_DISPATCH_COND:%.*]]
+// CHECK5: omp.dispatch.cond:
+// CHECK5-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK5-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK5-NEXT: [[CMP4:%.*]] = icmp ugt i32 [[TMP9]], [[TMP10]]
+// CHECK5-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK5: cond.true:
+// CHECK5-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK5-NEXT: br label [[COND_END:%.*]]
+// CHECK5: cond.false:
+// CHECK5-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK5-NEXT: br label [[COND_END]]
+// CHECK5: cond.end:
+// CHECK5-NEXT: [[COND:%.*]] = phi i32 [ [[TMP11]], [[COND_TRUE]] ], [ [[TMP12]], [[COND_FALSE]] ]
+// CHECK5-NEXT: store i32 [[COND]], i32* [[DOTOMP_UB]], align 4
+// CHECK5-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK5-NEXT: store i32 [[TMP13]], i32* [[DOTOMP_IV]], align 4
+// CHECK5-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK5-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK5-NEXT: [[CMP5:%.*]] = icmp sle i32 [[TMP14]], [[TMP15]]
+// CHECK5-NEXT: br i1 [[CMP5]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]]
+// CHECK5: omp.dispatch.body:
+// CHECK5-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK5: omp.inner.for.cond:
+// CHECK5-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK5-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK5-NEXT: [[CMP6:%.*]] = icmp sle i32 [[TMP16]], [[TMP17]]
+// CHECK5-NEXT: br i1 [[CMP6]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK5: omp.inner.for.body:
+// CHECK5-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK5-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP18]], 1
+// CHECK5-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK5-NEXT: store i32 [[ADD]], i32* [[I3]], align 4
+// CHECK5-NEXT: [[TMP19:%.*]] = load i32, i32* [[I3]], align 4
+// CHECK5-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1000 x i32], [1000 x i32]* [[TMP0]], i32 0, i32 [[TMP19]]
+// CHECK5-NEXT: store i32 1, i32* [[ARRAYIDX]], align 4
+// CHECK5-NEXT: [[TMP20:%.*]] = load i32, i32* [[I3]], align 4
+// CHECK5-NEXT: store i32 [[TMP20]], i32* [[L_ADDR]], align 4
+// CHECK5-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK5: omp.body.continue:
+// CHECK5-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK5: omp.inner.for.inc:
+// CHECK5-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK5-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP21]], 1
+// CHECK5-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_IV]], align 4
+// CHECK5-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP12:![0-9]+]]
+// CHECK5: omp.inner.for.end:
+// CHECK5-NEXT: br label [[OMP_DISPATCH_INC:%.*]]
+// CHECK5: omp.dispatch.inc:
+// CHECK5-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK5-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK5-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP22]], [[TMP23]]
+// CHECK5-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_LB]], align 4
+// CHECK5-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK5-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK5-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP24]], [[TMP25]]
+// CHECK5-NEXT: store i32 [[ADD9]], i32* [[DOTOMP_UB]], align 4
+// CHECK5-NEXT: br label [[OMP_DISPATCH_COND]]
+// CHECK5: omp.dispatch.end:
+// CHECK5-NEXT: [[TMP26:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK5-NEXT: [[TMP27:%.*]] = load i32, i32* [[TMP26]], align 4
+// CHECK5-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP27]])
+// CHECK5-NEXT: [[TMP28:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK5-NEXT: [[TMP29:%.*]] = icmp ne i32 [[TMP28]], 0
+// CHECK5-NEXT: br i1 [[TMP29]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]]
+// CHECK5: .omp.final.then:
+// CHECK5-NEXT: [[TMP30:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK5-NEXT: [[SUB10:%.*]] = sub nsw i32 [[TMP30]], 0
+// CHECK5-NEXT: [[DIV11:%.*]] = sdiv i32 [[SUB10]], 1
+// CHECK5-NEXT: [[MUL12:%.*]] = mul nsw i32 [[DIV11]], 1
+// CHECK5-NEXT: [[ADD13:%.*]] = add nsw i32 0, [[MUL12]]
+// CHECK5-NEXT: store i32 [[ADD13]], i32* [[I3]], align 4
+// CHECK5-NEXT: br label [[DOTOMP_FINAL_DONE]]
+// CHECK5: .omp.final.done:
+// CHECK5-NEXT: [[TMP31:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK5-NEXT: [[TMP32:%.*]] = icmp ne i32 [[TMP31]], 0
+// CHECK5-NEXT: br i1 [[TMP32]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]]
+// CHECK5: .omp.lastprivate.then:
+// CHECK5-NEXT: [[TMP33:%.*]] = load i32, i32* [[L_ADDR]], align 4
+// CHECK5-NEXT: store i32 [[TMP33]], i32* [[L_ADDR]], align 4
+// CHECK5-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]]
+// CHECK5: .omp.lastprivate.done:
+// CHECK5-NEXT: br label [[OMP_PRECOND_END]]
+// CHECK5: omp.precond.end:
+// CHECK5-NEXT: ret void
+//
+//
+// CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l44
+// CHECK5-SAME: (i32 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] {
+// CHECK5-NEXT: entry:
+// CHECK5-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 4
+// CHECK5-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK5-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4
+// CHECK5-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 4
+// CHECK5-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 4
+// CHECK5-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK5-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0)
+// CHECK5-NEXT: br label [[DOTEXECUTE:%.*]]
+// CHECK5: .execute:
+// CHECK5-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]])
+// CHECK5-NEXT: [[TMP2:%.*]] = load i32, i32* [[N_ADDR]], align 4
+// CHECK5-NEXT: store i32 [[TMP2]], i32* [[N_CASTED]], align 4
+// CHECK5-NEXT: [[TMP3:%.*]] = load i32, i32* [[N_CASTED]], align 4
+// CHECK5-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK5-NEXT: call void @__omp_outlined__2(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32 [[TMP3]], [1000 x i16]* [[TMP0]]) #[[ATTR2]]
+// CHECK5-NEXT: br label [[DOTOMP_DEINIT:%.*]]
+// CHECK5: .omp.deinit:
+// CHECK5-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0)
+// CHECK5-NEXT: br label [[DOTEXIT:%.*]]
+// CHECK5: .exit:
+// CHECK5-NEXT: ret void
+//
+//
+// CHECK5-LABEL: define {{[^@]+}}@__omp_outlined__2
+// CHECK5-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] {
+// CHECK5-NEXT: entry:
+// CHECK5-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK5-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK5-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 4
+// CHECK5-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[I3:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x i8*], align 4
+// CHECK5-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK5-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK5-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4
+// CHECK5-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 4
+// CHECK5-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 4
+// CHECK5-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4
+// CHECK5-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK5-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK5-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0
+// CHECK5-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK5-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1
+// CHECK5-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK5-NEXT: store i32 0, i32* [[I]], align 4
+// CHECK5-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK5-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]]
+// CHECK5-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]]
+// CHECK5: omp.precond.then:
+// CHECK5-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK5-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK5-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK5-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK5-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK5-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK5-NEXT: [[TMP5:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK5-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4
+// CHECK5-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP6]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]])
+// CHECK5-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK5-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK5-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP7]], [[TMP8]]
+// CHECK5-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK5: cond.true:
+// CHECK5-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK5-NEXT: br label [[COND_END:%.*]]
+// CHECK5: cond.false:
+// CHECK5-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK5-NEXT: br label [[COND_END]]
+// CHECK5: cond.end:
+// CHECK5-NEXT: [[COND:%.*]] = phi i32 [ [[TMP9]], [[COND_TRUE]] ], [ [[TMP10]], [[COND_FALSE]] ]
+// CHECK5-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK5-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK5-NEXT: store i32 [[TMP11]], i32* [[DOTOMP_IV]], align 4
+// CHECK5-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK5: omp.inner.for.cond:
+// CHECK5-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK5-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK5-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP13]], 1
+// CHECK5-NEXT: [[CMP5:%.*]] = icmp slt i32 [[TMP12]], [[ADD]]
+// CHECK5-NEXT: br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK5: omp.inner.for.body:
+// CHECK5-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK5-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK5-NEXT: [[TMP16:%.*]] = load i32, i32* [[N_ADDR]], align 4
+// CHECK5-NEXT: store i32 [[TMP16]], i32* [[N_CASTED]], align 4
+// CHECK5-NEXT: [[TMP17:%.*]] = load i32, i32* [[N_CASTED]], align 4
+// CHECK5-NEXT: [[TMP18:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0
+// CHECK5-NEXT: [[TMP19:%.*]] = inttoptr i32 [[TMP14]] to i8*
+// CHECK5-NEXT: store i8* [[TMP19]], i8** [[TMP18]], align 4
+// CHECK5-NEXT: [[TMP20:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1
+// CHECK5-NEXT: [[TMP21:%.*]] = inttoptr i32 [[TMP15]] to i8*
+// CHECK5-NEXT: store i8* [[TMP21]], i8** [[TMP20]], align 4
+// CHECK5-NEXT: [[TMP22:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2
+// CHECK5-NEXT: [[TMP23:%.*]] = inttoptr i32 [[TMP17]] to i8*
+// CHECK5-NEXT: store i8* [[TMP23]], i8** [[TMP22]], align 4
+// CHECK5-NEXT: [[TMP24:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3
+// CHECK5-NEXT: [[TMP25:%.*]] = bitcast [1000 x i16]* [[TMP0]] to i8*
+// CHECK5-NEXT: store i8* [[TMP25]], i8** [[TMP24]], align 4
+// CHECK5-NEXT: [[TMP26:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK5-NEXT: [[TMP27:%.*]] = load i32, i32* [[TMP26]], align 4
+// CHECK5-NEXT: [[TMP28:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK5-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP27]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, i32, [1000 x i16]*)* @__omp_outlined__3 to i8*), i8* null, i8** [[TMP28]], i32 4)
+// CHECK5-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK5: omp.inner.for.inc:
+// CHECK5-NEXT: [[TMP29:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK5-NEXT: [[TMP30:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK5-NEXT: [[ADD6:%.*]] = add nsw i32 [[TMP29]], [[TMP30]]
+// CHECK5-NEXT: store i32 [[ADD6]], i32* [[DOTOMP_IV]], align 4
+// CHECK5-NEXT: [[TMP31:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK5-NEXT: [[TMP32:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK5-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP31]], [[TMP32]]
+// CHECK5-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK5-NEXT: [[TMP33:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK5-NEXT: [[TMP34:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK5-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP33]], [[TMP34]]
+// CHECK5-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK5-NEXT: [[TMP35:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK5-NEXT: [[TMP36:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK5-NEXT: [[CMP9:%.*]] = icmp sgt i32 [[TMP35]], [[TMP36]]
+// CHECK5-NEXT: br i1 [[CMP9]], label [[COND_TRUE10:%.*]], label [[COND_FALSE11:%.*]]
+// CHECK5: cond.true10:
+// CHECK5-NEXT: [[TMP37:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK5-NEXT: br label [[COND_END12:%.*]]
+// CHECK5: cond.false11:
+// CHECK5-NEXT: [[TMP38:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK5-NEXT: br label [[COND_END12]]
+// CHECK5: cond.end12:
+// CHECK5-NEXT: [[COND13:%.*]] = phi i32 [ [[TMP37]], [[COND_TRUE10]] ], [ [[TMP38]], [[COND_FALSE11]] ]
+// CHECK5-NEXT: store i32 [[COND13]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK5-NEXT: [[TMP39:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK5-NEXT: store i32 [[TMP39]], i32* [[DOTOMP_IV]], align 4
+// CHECK5-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP13:![0-9]+]]
+// CHECK5: omp.inner.for.end:
+// CHECK5-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK5: omp.loop.exit:
+// CHECK5-NEXT: [[TMP40:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK5-NEXT: [[TMP41:%.*]] = load i32, i32* [[TMP40]], align 4
+// CHECK5-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP41]])
+// CHECK5-NEXT: [[TMP42:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK5-NEXT: [[TMP43:%.*]] = icmp ne i32 [[TMP42]], 0
+// CHECK5-NEXT: br i1 [[TMP43]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]]
+// CHECK5: .omp.final.then:
+// CHECK5-NEXT: [[TMP44:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK5-NEXT: [[SUB14:%.*]] = sub nsw i32 [[TMP44]], 0
+// CHECK5-NEXT: [[DIV15:%.*]] = sdiv i32 [[SUB14]], 1
+// CHECK5-NEXT: [[MUL:%.*]] = mul nsw i32 [[DIV15]], 1
+// CHECK5-NEXT: [[ADD16:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK5-NEXT: store i32 [[ADD16]], i32* [[I3]], align 4
+// CHECK5-NEXT: br label [[DOTOMP_FINAL_DONE]]
+// CHECK5: .omp.final.done:
+// CHECK5-NEXT: br label [[OMP_PRECOND_END]]
+// CHECK5: omp.precond.end:
+// CHECK5-NEXT: ret void
+//
+//
+// CHECK5-LABEL: define {{[^@]+}}@__omp_outlined__3
+// CHECK5-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] {
+// CHECK5-NEXT: entry:
+// CHECK5-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK5-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK5-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 4
+// CHECK5-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[I3:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK5-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK5-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK5-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK5-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4
+// CHECK5-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 4
+// CHECK5-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 4
+// CHECK5-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4
+// CHECK5-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK5-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK5-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0
+// CHECK5-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK5-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1
+// CHECK5-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK5-NEXT: store i32 0, i32* [[I]], align 4
+// CHECK5-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK5-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]]
+// CHECK5-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]]
+// CHECK5: omp.precond.then:
+// CHECK5-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK5-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK5-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_UB]], align 4
+// CHECK5-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK5-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK5-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_LB]], align 4
[[DOTOMP_LB]], align 4 +// CHECK5-NEXT: store i32 [[TMP6]], i32* [[DOTOMP_UB]], align 4 +// CHECK5-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK5-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK5-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK5-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4 +// CHECK5-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP8]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) +// CHECK5-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK5-NEXT: store i32 [[TMP9]], i32* [[DOTOMP_IV]], align 4 +// CHECK5-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK5: omp.inner.for.cond: +// CHECK5-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK5-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK5-NEXT: [[CMP4:%.*]] = icmp ule i32 [[TMP10]], [[TMP11]] +// CHECK5-NEXT: br i1 [[CMP4]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK5: omp.inner.for.body: +// CHECK5-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK5-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP12]], 1 +// CHECK5-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK5-NEXT: store i32 [[ADD]], i32* [[I3]], align 4 +// CHECK5-NEXT: [[TMP13:%.*]] = load i32, i32* [[I3]], align 4 +// CHECK5-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1000 x i16], [1000 x i16]* [[TMP0]], i32 0, i32 [[TMP13]] +// CHECK5-NEXT: [[TMP14:%.*]] = load i16, i16* [[ARRAYIDX]], align 2 +// CHECK5-NEXT: [[CONV:%.*]] = sext i16 [[TMP14]] to i32 +// CHECK5-NEXT: [[ADD5:%.*]] = add nsw i32 [[CONV]], 1 +// CHECK5-NEXT: [[CONV6:%.*]] = trunc i32 [[ADD5]] to i16 +// CHECK5-NEXT: store i16 [[CONV6]], i16* [[ARRAYIDX]], align 2 +// CHECK5-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK5: omp.body.continue: +// CHECK5-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK5: omp.inner.for.inc: +// CHECK5-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK5-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK5-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP15]], [[TMP16]] +// CHECK5-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_IV]], align 4 +// CHECK5-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP14:![0-9]+]] +// CHECK5: omp.inner.for.end: +// CHECK5-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK5: omp.loop.exit: +// CHECK5-NEXT: [[TMP17:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK5-NEXT: [[TMP18:%.*]] = load i32, i32* [[TMP17]], align 4 +// CHECK5-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP18]]) +// CHECK5-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK5-NEXT: [[TMP20:%.*]] = icmp ne i32 [[TMP19]], 0 +// CHECK5-NEXT: br i1 [[TMP20]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] +// CHECK5: .omp.final.then: +// CHECK5-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK5-NEXT: [[SUB8:%.*]] = sub nsw i32 [[TMP21]], 0 +// CHECK5-NEXT: [[DIV9:%.*]] = sdiv i32 [[SUB8]], 1 +// CHECK5-NEXT: [[MUL10:%.*]] = mul nsw i32 [[DIV9]], 1 +// CHECK5-NEXT: [[ADD11:%.*]] = add nsw i32 0, [[MUL10]] +// CHECK5-NEXT: store i32 [[ADD11]], i32* [[I3]], align 4 +// CHECK5-NEXT: br label [[DOTOMP_FINAL_DONE]] +// CHECK5: .omp.final.done: +// CHECK5-NEXT: br label [[OMP_PRECOND_END]] +// CHECK5: omp.precond.end: +// CHECK5-NEXT: ret void +// +// +// 
CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l49 +// CHECK5-SAME: ([10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { +// CHECK5-NEXT: entry: +// CHECK5-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 +// CHECK5-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK5-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 +// CHECK5-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 +// CHECK5-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK5-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) +// CHECK5-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK5: .execute: +// CHECK5-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) +// CHECK5-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK5-NEXT: call void @__omp_outlined__4(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x i32]* [[TMP0]]) #[[ATTR2]] +// CHECK5-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK5: .omp.deinit: +// CHECK5-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) +// CHECK5-NEXT: br label [[DOTEXIT:%.*]] +// CHECK5: .exit: +// CHECK5-NEXT: ret void +// +// +// CHECK5-LABEL: define {{[^@]+}}@__omp_outlined__4 +// CHECK5-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { +// CHECK5-NEXT: entry: +// CHECK5-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK5-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK5-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 +// CHECK5-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [3 x i8*], align 4 +// CHECK5-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK5-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK5-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 +// CHECK5-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 +// CHECK5-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK5-NEXT: store i32 9, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK5-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK5-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK5-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK5-NEXT: [[TMP1:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK5-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4 +// CHECK5-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) +// CHECK5-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK5-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP3]], 9 +// CHECK5-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label 
[[COND_FALSE:%.*]] +// CHECK5: cond.true: +// CHECK5-NEXT: br label [[COND_END:%.*]] +// CHECK5: cond.false: +// CHECK5-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK5-NEXT: br label [[COND_END]] +// CHECK5: cond.end: +// CHECK5-NEXT: [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP4]], [[COND_FALSE]] ] +// CHECK5-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK5-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK5-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 +// CHECK5-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK5: omp.inner.for.cond: +// CHECK5-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK5-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP6]], 10 +// CHECK5-NEXT: br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK5: omp.inner.for.body: +// CHECK5-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK5-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK5-NEXT: [[TMP9:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 +// CHECK5-NEXT: [[TMP10:%.*]] = inttoptr i32 [[TMP7]] to i8* +// CHECK5-NEXT: store i8* [[TMP10]], i8** [[TMP9]], align 4 +// CHECK5-NEXT: [[TMP11:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 +// CHECK5-NEXT: [[TMP12:%.*]] = inttoptr i32 [[TMP8]] to i8* +// CHECK5-NEXT: store i8* [[TMP12]], i8** [[TMP11]], align 4 +// CHECK5-NEXT: [[TMP13:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 +// CHECK5-NEXT: [[TMP14:%.*]] = bitcast [10 x i32]* [[TMP0]] to i8* +// CHECK5-NEXT: store i8* [[TMP14]], i8** [[TMP13]], align 4 +// CHECK5-NEXT: [[TMP15:%.*]] = bitcast [3 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK5-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, [10 x i32]*)* @__omp_outlined__5 to i8*), i8* null, i8** [[TMP15]], i32 3) +// CHECK5-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK5: omp.inner.for.inc: +// CHECK5-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK5-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK5-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP16]], [[TMP17]] +// CHECK5-NEXT: store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4 +// CHECK5-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK5-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK5-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP18]], [[TMP19]] +// CHECK5-NEXT: store i32 [[ADD2]], i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK5-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK5-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK5-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP20]], [[TMP21]] +// CHECK5-NEXT: store i32 [[ADD3]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK5-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK5-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP22]], 9 +// CHECK5-NEXT: br i1 [[CMP4]], label [[COND_TRUE5:%.*]], label [[COND_FALSE6:%.*]] +// CHECK5: cond.true5: +// CHECK5-NEXT: br label [[COND_END7:%.*]] +// CHECK5: cond.false6: +// CHECK5-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK5-NEXT: br label [[COND_END7]] +// CHECK5: cond.end7: +// CHECK5-NEXT: [[COND8:%.*]] = phi i32 [ 9, [[COND_TRUE5]] ], [ [[TMP23]], [[COND_FALSE6]] ] 
+// CHECK5-NEXT: store i32 [[COND8]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK5-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK5-NEXT: store i32 [[TMP24]], i32* [[DOTOMP_IV]], align 4 +// CHECK5-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP15:![0-9]+]] +// CHECK5: omp.inner.for.end: +// CHECK5-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK5: omp.loop.exit: +// CHECK5-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]]) +// CHECK5-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK5-NEXT: [[TMP26:%.*]] = icmp ne i32 [[TMP25]], 0 +// CHECK5-NEXT: br i1 [[TMP26]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] +// CHECK5: .omp.final.then: +// CHECK5-NEXT: store i32 10, i32* [[I]], align 4 +// CHECK5-NEXT: br label [[DOTOMP_FINAL_DONE]] +// CHECK5: .omp.final.done: +// CHECK5-NEXT: ret void +// +// +// CHECK5-LABEL: define {{[^@]+}}@__omp_outlined__5 +// CHECK5-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { +// CHECK5-NEXT: entry: +// CHECK5-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK5-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK5-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 +// CHECK5-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK5-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK5-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK5-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK5-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 +// CHECK5-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 +// CHECK5-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 +// CHECK5-NEXT: store i32 9, i32* [[DOTOMP_UB]], align 4 +// CHECK5-NEXT: [[TMP1:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK5-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK5-NEXT: store i32 [[TMP1]], i32* [[DOTOMP_LB]], align 4 +// CHECK5-NEXT: store i32 [[TMP2]], i32* [[DOTOMP_UB]], align 4 +// CHECK5-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK5-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK5-NEXT: [[TMP3:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK5-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4 +// CHECK5-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP4]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) +// CHECK5-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK5-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 +// CHECK5-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK5: omp.inner.for.cond: 
+// CHECK5-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK5-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK5-NEXT: [[CMP:%.*]] = icmp ule i32 [[TMP6]], [[TMP7]] +// CHECK5-NEXT: br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK5: omp.inner.for.body: +// CHECK5-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK5-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP8]], 1 +// CHECK5-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK5-NEXT: store i32 [[ADD]], i32* [[I]], align 4 +// CHECK5-NEXT: [[TMP9:%.*]] = load i32, i32* [[I]], align 4 +// CHECK5-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP0]], i32 0, i32 [[TMP9]] +// CHECK5-NEXT: [[TMP10:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +// CHECK5-NEXT: [[ADD1:%.*]] = add nsw i32 [[TMP10]], 1 +// CHECK5-NEXT: store i32 [[ADD1]], i32* [[ARRAYIDX]], align 4 +// CHECK5-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK5: omp.body.continue: +// CHECK5-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK5: omp.inner.for.inc: +// CHECK5-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK5-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK5-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP11]], [[TMP12]] +// CHECK5-NEXT: store i32 [[ADD2]], i32* [[DOTOMP_IV]], align 4 +// CHECK5-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP16:![0-9]+]] +// CHECK5: omp.inner.for.end: +// CHECK5-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK5: omp.loop.exit: +// CHECK5-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP4]]) +// CHECK5-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK5-NEXT: [[TMP14:%.*]] = icmp ne i32 [[TMP13]], 0 +// CHECK5-NEXT: br i1 [[TMP14]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] +// CHECK5: .omp.final.then: +// CHECK5-NEXT: store i32 10, i32* [[I]], align 4 +// CHECK5-NEXT: br label [[DOTOMP_FINAL_DONE]] +// CHECK5: .omp.final.done: +// CHECK5-NEXT: ret void +// +// +// CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l54 +// CHECK5-SAME: ([10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i32 [[F:%.*]]) #[[ATTR0]] { +// CHECK5-NEXT: entry: +// CHECK5-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4 +// CHECK5-NEXT: [[F_ADDR:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[F_CASTED:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK5-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4 +// CHECK5-NEXT: store i32 [[F]], i32* [[F_ADDR]], align 4 +// CHECK5-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4 +// CHECK5-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK5-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) +// CHECK5-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK5: .execute: +// CHECK5-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) +// CHECK5-NEXT: [[TMP2:%.*]] = load i32, i32* [[F_ADDR]], align 4 +// CHECK5-NEXT: store i32 [[TMP2]], i32* [[F_CASTED]], align 4 +// CHECK5-NEXT: [[TMP3:%.*]] = load i32, i32* [[F_CASTED]], align 4 +// CHECK5-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +// 
CHECK5-NEXT: call void @__omp_outlined__6(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x [10 x i32]]* [[TMP0]], i32 [[TMP3]]) #[[ATTR2]] +// CHECK5-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK5: .omp.deinit: +// CHECK5-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) +// CHECK5-NEXT: br label [[DOTEXIT:%.*]] +// CHECK5: .exit: +// CHECK5-NEXT: ret void +// +// +// CHECK5-LABEL: define {{[^@]+}}@__omp_outlined__6 +// CHECK5-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i32 [[F:%.*]]) #[[ATTR0]] { +// CHECK5-NEXT: entry: +// CHECK5-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK5-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK5-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4 +// CHECK5-NEXT: [[F_ADDR:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[_TMP1:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[K:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[J:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[F_CASTED:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x i8*], align 4 +// CHECK5-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK5-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK5-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4 +// CHECK5-NEXT: store i32 [[F]], i32* [[F_ADDR]], align 4 +// CHECK5-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4 +// CHECK5-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK5-NEXT: store i32 99, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK5-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK5-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK5-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK5-NEXT: [[TMP1:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK5-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4 +// CHECK5-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) +// CHECK5-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK5-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP3]], 99 +// CHECK5-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK5: cond.true: +// CHECK5-NEXT: br label [[COND_END:%.*]] +// CHECK5: cond.false: +// CHECK5-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK5-NEXT: br label [[COND_END]] +// CHECK5: cond.end: +// CHECK5-NEXT: [[COND:%.*]] = phi i32 [ 99, [[COND_TRUE]] ], [ [[TMP4]], [[COND_FALSE]] ] +// CHECK5-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK5-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK5-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 +// CHECK5-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK5: omp.inner.for.cond: 
+// CHECK5-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK5-NEXT: [[CMP2:%.*]] = icmp slt i32 [[TMP6]], 100 +// CHECK5-NEXT: br i1 [[CMP2]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK5: omp.inner.for.body: +// CHECK5-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK5-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK5-NEXT: [[TMP9:%.*]] = load i32, i32* [[F_ADDR]], align 4 +// CHECK5-NEXT: store i32 [[TMP9]], i32* [[F_CASTED]], align 4 +// CHECK5-NEXT: [[TMP10:%.*]] = load i32, i32* [[F_CASTED]], align 4 +// CHECK5-NEXT: [[TMP11:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 +// CHECK5-NEXT: [[TMP12:%.*]] = inttoptr i32 [[TMP7]] to i8* +// CHECK5-NEXT: store i8* [[TMP12]], i8** [[TMP11]], align 4 +// CHECK5-NEXT: [[TMP13:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 +// CHECK5-NEXT: [[TMP14:%.*]] = inttoptr i32 [[TMP8]] to i8* +// CHECK5-NEXT: store i8* [[TMP14]], i8** [[TMP13]], align 4 +// CHECK5-NEXT: [[TMP15:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 +// CHECK5-NEXT: [[TMP16:%.*]] = bitcast [10 x [10 x i32]]* [[TMP0]] to i8* +// CHECK5-NEXT: store i8* [[TMP16]], i8** [[TMP15]], align 4 +// CHECK5-NEXT: [[TMP17:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3 +// CHECK5-NEXT: [[TMP18:%.*]] = inttoptr i32 [[TMP10]] to i8* +// CHECK5-NEXT: store i8* [[TMP18]], i8** [[TMP17]], align 4 +// CHECK5-NEXT: [[TMP19:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK5-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, [10 x [10 x i32]]*, i32)* @__omp_outlined__7 to i8*), i8* null, i8** [[TMP19]], i32 4) +// CHECK5-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK5: omp.inner.for.inc: +// CHECK5-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK5-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK5-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP20]], [[TMP21]] +// CHECK5-NEXT: store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4 +// CHECK5-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK5-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK5-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP22]], [[TMP23]] +// CHECK5-NEXT: store i32 [[ADD3]], i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK5-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK5-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK5-NEXT: [[ADD4:%.*]] = add nsw i32 [[TMP24]], [[TMP25]] +// CHECK5-NEXT: store i32 [[ADD4]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK5-NEXT: [[TMP26:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK5-NEXT: [[CMP5:%.*]] = icmp sgt i32 [[TMP26]], 99 +// CHECK5-NEXT: br i1 [[CMP5]], label [[COND_TRUE6:%.*]], label [[COND_FALSE7:%.*]] +// CHECK5: cond.true6: +// CHECK5-NEXT: br label [[COND_END8:%.*]] +// CHECK5: cond.false7: +// CHECK5-NEXT: [[TMP27:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK5-NEXT: br label [[COND_END8]] +// CHECK5: cond.end8: +// CHECK5-NEXT: [[COND9:%.*]] = phi i32 [ 99, [[COND_TRUE6]] ], [ [[TMP27]], [[COND_FALSE7]] ] +// CHECK5-NEXT: store i32 [[COND9]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK5-NEXT: [[TMP28:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK5-NEXT: 
store i32 [[TMP28]], i32* [[DOTOMP_IV]], align 4 +// CHECK5-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP17:![0-9]+]] +// CHECK5: omp.inner.for.end: +// CHECK5-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK5: omp.loop.exit: +// CHECK5-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]]) +// CHECK5-NEXT: [[TMP29:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK5-NEXT: [[TMP30:%.*]] = icmp ne i32 [[TMP29]], 0 +// CHECK5-NEXT: br i1 [[TMP30]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] +// CHECK5: .omp.final.then: +// CHECK5-NEXT: store i32 10, i32* [[I]], align 4 +// CHECK5-NEXT: store i32 10, i32* [[J]], align 4 +// CHECK5-NEXT: br label [[DOTOMP_FINAL_DONE]] +// CHECK5: .omp.final.done: +// CHECK5-NEXT: ret void +// +// +// CHECK5-LABEL: define {{[^@]+}}@__omp_outlined__7 +// CHECK5-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i32 [[F:%.*]]) #[[ATTR0]] { +// CHECK5-NEXT: entry: +// CHECK5-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK5-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK5-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4 +// CHECK5-NEXT: [[F_ADDR:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[_TMP1:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[K:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[J:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK5-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK5-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK5-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK5-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4 +// CHECK5-NEXT: store i32 [[F]], i32* [[F_ADDR]], align 4 +// CHECK5-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4 +// CHECK5-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 +// CHECK5-NEXT: store i32 99, i32* [[DOTOMP_UB]], align 4 +// CHECK5-NEXT: [[TMP1:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK5-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK5-NEXT: store i32 [[TMP1]], i32* [[DOTOMP_LB]], align 4 +// CHECK5-NEXT: store i32 [[TMP2]], i32* [[DOTOMP_UB]], align 4 +// CHECK5-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK5-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK5-NEXT: [[TMP3:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK5-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4 +// CHECK5-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP4]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) +// 
CHECK5-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK5-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 +// CHECK5-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK5: omp.inner.for.cond: +// CHECK5-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK5-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK5-NEXT: [[CMP:%.*]] = icmp ule i32 [[TMP6]], [[TMP7]] +// CHECK5-NEXT: br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK5: omp.inner.for.body: +// CHECK5-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK5-NEXT: [[DIV:%.*]] = sdiv i32 [[TMP8]], 10 +// CHECK5-NEXT: [[MUL:%.*]] = mul nsw i32 [[DIV]], 1 +// CHECK5-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK5-NEXT: store i32 [[ADD]], i32* [[I]], align 4 +// CHECK5-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK5-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK5-NEXT: [[DIV2:%.*]] = sdiv i32 [[TMP10]], 10 +// CHECK5-NEXT: [[MUL3:%.*]] = mul nsw i32 [[DIV2]], 10 +// CHECK5-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP9]], [[MUL3]] +// CHECK5-NEXT: [[MUL4:%.*]] = mul nsw i32 [[SUB]], 1 +// CHECK5-NEXT: [[ADD5:%.*]] = add nsw i32 0, [[MUL4]] +// CHECK5-NEXT: store i32 [[ADD5]], i32* [[J]], align 4 +// CHECK5-NEXT: store i32 10, i32* [[K]], align 4 +// CHECK5-NEXT: [[TMP11:%.*]] = load i32, i32* [[I]], align 4 +// CHECK5-NEXT: [[TMP12:%.*]] = load i32, i32* [[J]], align 4 +// CHECK5-NEXT: [[TMP13:%.*]] = load i32, i32* [[F_ADDR]], align 4 +// CHECK5-NEXT: [[MUL6:%.*]] = mul nsw i32 [[TMP12]], [[TMP13]] +// CHECK5-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP11]], [[MUL6]] +// CHECK5-NEXT: [[TMP14:%.*]] = load i32, i32* [[K]], align 4 +// CHECK5-NEXT: [[ADD8:%.*]] = add nsw i32 [[ADD7]], [[TMP14]] +// CHECK5-NEXT: [[TMP15:%.*]] = load i32, i32* [[I]], align 4 +// CHECK5-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* [[TMP0]], i32 0, i32 [[TMP15]] +// CHECK5-NEXT: [[TMP16:%.*]] = load i32, i32* [[J]], align 4 +// CHECK5-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[ARRAYIDX]], i32 0, i32 [[TMP16]] +// CHECK5-NEXT: store i32 [[ADD8]], i32* [[ARRAYIDX9]], align 4 +// CHECK5-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK5: omp.body.continue: +// CHECK5-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK5: omp.inner.for.inc: +// CHECK5-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK5-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK5-NEXT: [[ADD10:%.*]] = add nsw i32 [[TMP17]], [[TMP18]] +// CHECK5-NEXT: store i32 [[ADD10]], i32* [[DOTOMP_IV]], align 4 +// CHECK5-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP18:![0-9]+]] +// CHECK5: omp.inner.for.end: +// CHECK5-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK5: omp.loop.exit: +// CHECK5-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP4]]) +// CHECK5-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK5-NEXT: [[TMP20:%.*]] = icmp ne i32 [[TMP19]], 0 +// CHECK5-NEXT: br i1 [[TMP20]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] +// CHECK5: .omp.final.then: +// CHECK5-NEXT: store i32 10, i32* [[I]], align 4 +// CHECK5-NEXT: store i32 10, i32* [[J]], align 4 +// CHECK5-NEXT: br label [[DOTOMP_FINAL_DONE]] +// CHECK5: .omp.final.done: +// CHECK5-NEXT: ret void +// +// +// CHECK6-LABEL: define 
{{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l38 +// CHECK6-SAME: (i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32 [[L:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK6-NEXT: entry: +// CHECK6-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4 +// CHECK6-NEXT: [[L_ADDR:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[L_CASTED:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK6-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK6-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4 +// CHECK6-NEXT: store i32 [[L]], i32* [[L_ADDR]], align 4 +// CHECK6-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4 +// CHECK6-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK6-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) +// CHECK6-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK6: .execute: +// CHECK6-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3:[0-9]+]]) +// CHECK6-NEXT: [[TMP2:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK6-NEXT: store i32 [[TMP2]], i32* [[N_CASTED]], align 4 +// CHECK6-NEXT: [[TMP3:%.*]] = load i32, i32* [[N_CASTED]], align 4 +// CHECK6-NEXT: [[TMP4:%.*]] = load i32, i32* [[L_ADDR]], align 4 +// CHECK6-NEXT: store i32 [[TMP4]], i32* [[L_CASTED]], align 4 +// CHECK6-NEXT: [[TMP5:%.*]] = load i32, i32* [[L_CASTED]], align 4 +// CHECK6-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK6-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32 [[TMP3]], [1000 x i32]* [[TMP0]], i32 [[TMP5]]) #[[ATTR2:[0-9]+]] +// CHECK6-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK6: .omp.deinit: +// CHECK6-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) +// CHECK6-NEXT: br label [[DOTEXIT:%.*]] +// CHECK6: .exit: +// CHECK6-NEXT: ret void +// +// +// CHECK6-LABEL: define {{[^@]+}}@__omp_outlined__ +// CHECK6-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32 [[L:%.*]]) #[[ATTR0]] { +// CHECK6-NEXT: entry: +// CHECK6-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK6-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK6-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4 +// CHECK6-NEXT: [[L_ADDR:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[I4:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[L_CASTED:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [5 x i8*], align 4 +// 
CHECK6-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK6-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK6-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK6-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4 +// CHECK6-NEXT: store i32 [[L]], i32* [[L_ADDR]], align 4 +// CHECK6-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4 +// CHECK6-NEXT: [[TMP1:%.*]] = call i8* @__kmpc_data_sharing_push_stack(i32 4, i16 1) +// CHECK6-NEXT: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to %struct._globalized_locals_ty* +// CHECK6-NEXT: [[L1:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP2]], i32 0, i32 0 +// CHECK6-NEXT: [[TMP3:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK6-NEXT: store i32 [[TMP3]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK6-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK6-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP4]], 0 +// CHECK6-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK6-NEXT: [[SUB3:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK6-NEXT: store i32 [[SUB3]], i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK6-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK6-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK6-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP5]] +// CHECK6-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK6: omp.precond.then: +// CHECK6-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK6-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK6-NEXT: store i32 [[TMP6]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK6-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK6-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK6-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK6-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4 +// CHECK6-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1:[0-9]+]], i32 [[TMP8]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 128) +// CHECK6-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK6-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK6-NEXT: [[CMP5:%.*]] = icmp sgt i32 [[TMP9]], [[TMP10]] +// CHECK6-NEXT: br i1 [[CMP5]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK6: cond.true: +// CHECK6-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK6-NEXT: br label [[COND_END:%.*]] +// CHECK6: cond.false: +// CHECK6-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK6-NEXT: br label [[COND_END]] +// CHECK6: cond.end: +// CHECK6-NEXT: [[COND:%.*]] = phi i32 [ [[TMP11]], [[COND_TRUE]] ], [ [[TMP12]], [[COND_FALSE]] ] +// CHECK6-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK6-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK6-NEXT: store i32 [[TMP13]], i32* [[DOTOMP_IV]], align 4 +// CHECK6-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK6: omp.inner.for.cond: +// CHECK6-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK6-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK6-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP15]], 1 +// CHECK6-NEXT: [[CMP6:%.*]] = icmp slt i32 [[TMP14]], [[ADD]] +// CHECK6-NEXT: br i1 [[CMP6]], label 
[[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK6: omp.inner.for.body: +// CHECK6-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK6-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK6-NEXT: [[TMP18:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK6-NEXT: store i32 [[TMP18]], i32* [[N_CASTED]], align 4 +// CHECK6-NEXT: [[TMP19:%.*]] = load i32, i32* [[N_CASTED]], align 4 +// CHECK6-NEXT: [[TMP20:%.*]] = load i32, i32* [[L_ADDR]], align 4 +// CHECK6-NEXT: store i32 [[TMP20]], i32* [[L_CASTED]], align 4 +// CHECK6-NEXT: [[TMP21:%.*]] = load i32, i32* [[L_CASTED]], align 4 +// CHECK6-NEXT: [[TMP22:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 +// CHECK6-NEXT: [[TMP23:%.*]] = inttoptr i32 [[TMP16]] to i8* +// CHECK6-NEXT: store i8* [[TMP23]], i8** [[TMP22]], align 4 +// CHECK6-NEXT: [[TMP24:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 +// CHECK6-NEXT: [[TMP25:%.*]] = inttoptr i32 [[TMP17]] to i8* +// CHECK6-NEXT: store i8* [[TMP25]], i8** [[TMP24]], align 4 +// CHECK6-NEXT: [[TMP26:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 +// CHECK6-NEXT: [[TMP27:%.*]] = inttoptr i32 [[TMP19]] to i8* +// CHECK6-NEXT: store i8* [[TMP27]], i8** [[TMP26]], align 4 +// CHECK6-NEXT: [[TMP28:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3 +// CHECK6-NEXT: [[TMP29:%.*]] = bitcast [1000 x i32]* [[TMP0]] to i8* +// CHECK6-NEXT: store i8* [[TMP29]], i8** [[TMP28]], align 4 +// CHECK6-NEXT: [[TMP30:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 4 +// CHECK6-NEXT: [[TMP31:%.*]] = inttoptr i32 [[TMP21]] to i8* +// CHECK6-NEXT: store i8* [[TMP31]], i8** [[TMP30]], align 4 +// CHECK6-NEXT: [[TMP32:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK6-NEXT: [[TMP33:%.*]] = load i32, i32* [[TMP32]], align 4 +// CHECK6-NEXT: [[TMP34:%.*]] = bitcast [5 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK6-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP33]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, i32, [1000 x i32]*, i32)* @__omp_outlined__1 to i8*), i8* null, i8** [[TMP34]], i32 5) +// CHECK6-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK6: omp.inner.for.inc: +// CHECK6-NEXT: [[TMP35:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK6-NEXT: [[TMP36:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK6-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP35]], [[TMP36]] +// CHECK6-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_IV]], align 4 +// CHECK6-NEXT: [[TMP37:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK6-NEXT: [[TMP38:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK6-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP37]], [[TMP38]] +// CHECK6-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK6-NEXT: [[TMP39:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK6-NEXT: [[TMP40:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK6-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP39]], [[TMP40]] +// CHECK6-NEXT: store i32 [[ADD9]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK6-NEXT: [[TMP41:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK6-NEXT: [[TMP42:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK6-NEXT: [[CMP10:%.*]] = icmp sgt i32 [[TMP41]], [[TMP42]] +// CHECK6-NEXT: br i1 [[CMP10]], label [[COND_TRUE11:%.*]], 
label [[COND_FALSE12:%.*]] +// CHECK6: cond.true11: +// CHECK6-NEXT: [[TMP43:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK6-NEXT: br label [[COND_END13:%.*]] +// CHECK6: cond.false12: +// CHECK6-NEXT: [[TMP44:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK6-NEXT: br label [[COND_END13]] +// CHECK6: cond.end13: +// CHECK6-NEXT: [[COND14:%.*]] = phi i32 [ [[TMP43]], [[COND_TRUE11]] ], [ [[TMP44]], [[COND_FALSE12]] ] +// CHECK6-NEXT: store i32 [[COND14]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK6-NEXT: [[TMP45:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK6-NEXT: store i32 [[TMP45]], i32* [[DOTOMP_IV]], align 4 +// CHECK6-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP10:![0-9]+]] +// CHECK6: omp.inner.for.end: +// CHECK6-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK6: omp.loop.exit: +// CHECK6-NEXT: [[TMP46:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK6-NEXT: [[TMP47:%.*]] = load i32, i32* [[TMP46]], align 4 +// CHECK6-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP47]]) +// CHECK6-NEXT: [[TMP48:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK6-NEXT: [[TMP49:%.*]] = icmp ne i32 [[TMP48]], 0 +// CHECK6-NEXT: br i1 [[TMP49]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] +// CHECK6: .omp.final.then: +// CHECK6-NEXT: [[TMP50:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK6-NEXT: [[SUB15:%.*]] = sub nsw i32 [[TMP50]], 0 +// CHECK6-NEXT: [[DIV16:%.*]] = sdiv i32 [[SUB15]], 1 +// CHECK6-NEXT: [[MUL:%.*]] = mul nsw i32 [[DIV16]], 1 +// CHECK6-NEXT: [[ADD17:%.*]] = add nsw i32 0, [[MUL]] +// CHECK6-NEXT: store i32 [[ADD17]], i32* [[I4]], align 4 +// CHECK6-NEXT: br label [[DOTOMP_FINAL_DONE]] +// CHECK6: .omp.final.done: +// CHECK6-NEXT: [[TMP51:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK6-NEXT: [[TMP52:%.*]] = icmp ne i32 [[TMP51]], 0 +// CHECK6-NEXT: br i1 [[TMP52]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]] +// CHECK6: .omp.lastprivate.then: +// CHECK6-NEXT: [[TMP53:%.*]] = load i32, i32* [[L_ADDR]], align 4 +// CHECK6-NEXT: store i32 [[TMP53]], i32* [[L_ADDR]], align 4 +// CHECK6-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]] +// CHECK6: .omp.lastprivate.done: +// CHECK6-NEXT: br label [[OMP_PRECOND_END]] +// CHECK6: omp.precond.end: +// CHECK6-NEXT: call void @__kmpc_data_sharing_pop_stack(i8* [[TMP1]]) +// CHECK6-NEXT: ret void +// +// +// CHECK6-LABEL: define {{[^@]+}}@__omp_outlined__1 +// CHECK6-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32 [[L:%.*]]) #[[ATTR0]] { +// CHECK6-NEXT: entry: +// CHECK6-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK6-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK6-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4 +// CHECK6-NEXT: [[L_ADDR:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: 
[[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[I3:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK6-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK6-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK6-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK6-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK6-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4 +// CHECK6-NEXT: store i32 [[L]], i32* [[L_ADDR]], align 4 +// CHECK6-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4 +// CHECK6-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK6-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK6-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK6-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 +// CHECK6-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK6-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK6-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK6-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK6-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK6-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] +// CHECK6-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK6: omp.precond.then: +// CHECK6-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 +// CHECK6-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK6-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_UB]], align 4 +// CHECK6-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK6-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK6-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_LB]], align 4 +// CHECK6-NEXT: store i32 [[TMP6]], i32* [[DOTOMP_UB]], align 4 +// CHECK6-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK6-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK6-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK6-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4 +// CHECK6-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP8]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 32) +// CHECK6-NEXT: br label [[OMP_DISPATCH_COND:%.*]] +// CHECK6: omp.dispatch.cond: +// CHECK6-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK6-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK6-NEXT: [[CMP4:%.*]] = icmp ugt i32 [[TMP9]], [[TMP10]] +// CHECK6-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK6: cond.true: +// CHECK6-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK6-NEXT: br label [[COND_END:%.*]] +// CHECK6: cond.false: +// CHECK6-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK6-NEXT: br label [[COND_END]] +// CHECK6: cond.end: +// CHECK6-NEXT: [[COND:%.*]] = phi i32 [ [[TMP11]], [[COND_TRUE]] ], [ [[TMP12]], [[COND_FALSE]] ] +// CHECK6-NEXT: store i32 [[COND]], i32* [[DOTOMP_UB]], align 4 +// CHECK6-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTOMP_LB]], 
align 4 +// CHECK6-NEXT: store i32 [[TMP13]], i32* [[DOTOMP_IV]], align 4 +// CHECK6-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK6-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK6-NEXT: [[CMP5:%.*]] = icmp sle i32 [[TMP14]], [[TMP15]] +// CHECK6-NEXT: br i1 [[CMP5]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]] +// CHECK6: omp.dispatch.body: +// CHECK6-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK6: omp.inner.for.cond: +// CHECK6-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK6-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK6-NEXT: [[CMP6:%.*]] = icmp sle i32 [[TMP16]], [[TMP17]] +// CHECK6-NEXT: br i1 [[CMP6]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK6: omp.inner.for.body: +// CHECK6-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK6-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP18]], 1 +// CHECK6-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK6-NEXT: store i32 [[ADD]], i32* [[I3]], align 4 +// CHECK6-NEXT: [[TMP19:%.*]] = load i32, i32* [[I3]], align 4 +// CHECK6-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1000 x i32], [1000 x i32]* [[TMP0]], i32 0, i32 [[TMP19]] +// CHECK6-NEXT: store i32 1, i32* [[ARRAYIDX]], align 4 +// CHECK6-NEXT: [[TMP20:%.*]] = load i32, i32* [[I3]], align 4 +// CHECK6-NEXT: store i32 [[TMP20]], i32* [[L_ADDR]], align 4 +// CHECK6-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK6: omp.body.continue: +// CHECK6-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK6: omp.inner.for.inc: +// CHECK6-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK6-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP21]], 1 +// CHECK6-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_IV]], align 4 +// CHECK6-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP12:![0-9]+]] +// CHECK6: omp.inner.for.end: +// CHECK6-NEXT: br label [[OMP_DISPATCH_INC:%.*]] +// CHECK6: omp.dispatch.inc: +// CHECK6-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK6-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK6-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP22]], [[TMP23]] +// CHECK6-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_LB]], align 4 +// CHECK6-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK6-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK6-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP24]], [[TMP25]] +// CHECK6-NEXT: store i32 [[ADD9]], i32* [[DOTOMP_UB]], align 4 +// CHECK6-NEXT: br label [[OMP_DISPATCH_COND]] +// CHECK6: omp.dispatch.end: +// CHECK6-NEXT: [[TMP26:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK6-NEXT: [[TMP27:%.*]] = load i32, i32* [[TMP26]], align 4 +// CHECK6-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP27]]) +// CHECK6-NEXT: [[TMP28:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK6-NEXT: [[TMP29:%.*]] = icmp ne i32 [[TMP28]], 0 +// CHECK6-NEXT: br i1 [[TMP29]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] +// CHECK6: .omp.final.then: +// CHECK6-NEXT: [[TMP30:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK6-NEXT: [[SUB10:%.*]] = sub nsw i32 [[TMP30]], 0 +// CHECK6-NEXT: [[DIV11:%.*]] = sdiv i32 [[SUB10]], 1 +// CHECK6-NEXT: [[MUL12:%.*]] = mul nsw i32 [[DIV11]], 1 +// CHECK6-NEXT: [[ADD13:%.*]] = add nsw i32 0, [[MUL12]] +// CHECK6-NEXT: store i32 [[ADD13]], i32* [[I3]], align 4 +// CHECK6-NEXT: br label [[DOTOMP_FINAL_DONE]] +// CHECK6: 
.omp.final.done: +// CHECK6-NEXT: [[TMP31:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK6-NEXT: [[TMP32:%.*]] = icmp ne i32 [[TMP31]], 0 +// CHECK6-NEXT: br i1 [[TMP32]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]] +// CHECK6: .omp.lastprivate.then: +// CHECK6-NEXT: [[TMP33:%.*]] = load i32, i32* [[L_ADDR]], align 4 +// CHECK6-NEXT: store i32 [[TMP33]], i32* [[L_ADDR]], align 4 +// CHECK6-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]] +// CHECK6: .omp.lastprivate.done: +// CHECK6-NEXT: br label [[OMP_PRECOND_END]] +// CHECK6: omp.precond.end: +// CHECK6-NEXT: ret void +// +// +// CHECK6-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l44 +// CHECK6-SAME: (i32 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { +// CHECK6-NEXT: entry: +// CHECK6-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 4 +// CHECK6-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK6-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK6-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 4 +// CHECK6-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 4 +// CHECK6-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK6-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) +// CHECK6-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK6: .execute: +// CHECK6-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) +// CHECK6-NEXT: [[TMP2:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK6-NEXT: store i32 [[TMP2]], i32* [[N_CASTED]], align 4 +// CHECK6-NEXT: [[TMP3:%.*]] = load i32, i32* [[N_CASTED]], align 4 +// CHECK6-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK6-NEXT: call void @__omp_outlined__2(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32 [[TMP3]], [1000 x i16]* [[TMP0]]) #[[ATTR2]] +// CHECK6-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK6: .omp.deinit: +// CHECK6-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) +// CHECK6-NEXT: br label [[DOTEXIT:%.*]] +// CHECK6: .exit: +// CHECK6-NEXT: ret void +// +// +// CHECK6-LABEL: define {{[^@]+}}@__omp_outlined__2 +// CHECK6-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { +// CHECK6-NEXT: entry: +// CHECK6-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK6-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK6-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 4 +// CHECK6-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[I3:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: 
[[N_CASTED:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x i8*], align 4 +// CHECK6-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK6-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK6-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK6-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 4 +// CHECK6-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 4 +// CHECK6-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK6-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK6-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK6-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 +// CHECK6-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK6-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK6-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK6-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK6-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK6-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] +// CHECK6-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK6: omp.precond.then: +// CHECK6-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK6-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK6-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK6-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK6-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK6-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK6-NEXT: [[TMP5:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK6-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4 +// CHECK6-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP6]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) +// CHECK6-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK6-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK6-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP7]], [[TMP8]] +// CHECK6-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK6: cond.true: +// CHECK6-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK6-NEXT: br label [[COND_END:%.*]] +// CHECK6: cond.false: +// CHECK6-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK6-NEXT: br label [[COND_END]] +// CHECK6: cond.end: +// CHECK6-NEXT: [[COND:%.*]] = phi i32 [ [[TMP9]], [[COND_TRUE]] ], [ [[TMP10]], [[COND_FALSE]] ] +// CHECK6-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK6-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK6-NEXT: store i32 [[TMP11]], i32* [[DOTOMP_IV]], align 4 +// CHECK6-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK6: omp.inner.for.cond: +// CHECK6-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK6-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK6-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP13]], 1 +// CHECK6-NEXT: [[CMP5:%.*]] = icmp slt i32 [[TMP12]], [[ADD]] +// CHECK6-NEXT: br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK6: omp.inner.for.body: +// CHECK6-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 
+// CHECK6-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK6-NEXT: [[TMP16:%.*]] = load i32, i32* [[N_ADDR]], align 4
+// CHECK6-NEXT: store i32 [[TMP16]], i32* [[N_CASTED]], align 4
+// CHECK6-NEXT: [[TMP17:%.*]] = load i32, i32* [[N_CASTED]], align 4
+// CHECK6-NEXT: [[TMP18:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0
+// CHECK6-NEXT: [[TMP19:%.*]] = inttoptr i32 [[TMP14]] to i8*
+// CHECK6-NEXT: store i8* [[TMP19]], i8** [[TMP18]], align 4
+// CHECK6-NEXT: [[TMP20:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1
+// CHECK6-NEXT: [[TMP21:%.*]] = inttoptr i32 [[TMP15]] to i8*
+// CHECK6-NEXT: store i8* [[TMP21]], i8** [[TMP20]], align 4
+// CHECK6-NEXT: [[TMP22:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2
+// CHECK6-NEXT: [[TMP23:%.*]] = inttoptr i32 [[TMP17]] to i8*
+// CHECK6-NEXT: store i8* [[TMP23]], i8** [[TMP22]], align 4
+// CHECK6-NEXT: [[TMP24:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3
+// CHECK6-NEXT: [[TMP25:%.*]] = bitcast [1000 x i16]* [[TMP0]] to i8*
+// CHECK6-NEXT: store i8* [[TMP25]], i8** [[TMP24]], align 4
+// CHECK6-NEXT: [[TMP26:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK6-NEXT: [[TMP27:%.*]] = load i32, i32* [[TMP26]], align 4
+// CHECK6-NEXT: [[TMP28:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK6-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP27]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, i32, [1000 x i16]*)* @__omp_outlined__3 to i8*), i8* null, i8** [[TMP28]], i32 4)
+// CHECK6-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK6: omp.inner.for.inc:
+// CHECK6-NEXT: [[TMP29:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK6-NEXT: [[TMP30:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK6-NEXT: [[ADD6:%.*]] = add nsw i32 [[TMP29]], [[TMP30]]
+// CHECK6-NEXT: store i32 [[ADD6]], i32* [[DOTOMP_IV]], align 4
+// CHECK6-NEXT: [[TMP31:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK6-NEXT: [[TMP32:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK6-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP31]], [[TMP32]]
+// CHECK6-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK6-NEXT: [[TMP33:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK6-NEXT: [[TMP34:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK6-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP33]], [[TMP34]]
+// CHECK6-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK6-NEXT: [[TMP35:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK6-NEXT: [[TMP36:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK6-NEXT: [[CMP9:%.*]] = icmp sgt i32 [[TMP35]], [[TMP36]]
+// CHECK6-NEXT: br i1 [[CMP9]], label [[COND_TRUE10:%.*]], label [[COND_FALSE11:%.*]]
+// CHECK6: cond.true10:
+// CHECK6-NEXT: [[TMP37:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK6-NEXT: br label [[COND_END12:%.*]]
+// CHECK6: cond.false11:
+// CHECK6-NEXT: [[TMP38:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK6-NEXT: br label [[COND_END12]]
+// CHECK6: cond.end12:
+// CHECK6-NEXT: [[COND13:%.*]] = phi i32 [ [[TMP37]], [[COND_TRUE10]] ], [ [[TMP38]], [[COND_FALSE11]] ]
+// CHECK6-NEXT: store i32 [[COND13]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK6-NEXT: [[TMP39:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK6-NEXT: store i32 [[TMP39]], i32* [[DOTOMP_IV]], align 4
+// CHECK6-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP13:![0-9]+]]
+// CHECK6: omp.inner.for.end:
+// CHECK6-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK6: omp.loop.exit:
+// CHECK6-NEXT: [[TMP40:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK6-NEXT: [[TMP41:%.*]] = load i32, i32* [[TMP40]], align 4
+// CHECK6-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP41]])
+// CHECK6-NEXT: [[TMP42:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK6-NEXT: [[TMP43:%.*]] = icmp ne i32 [[TMP42]], 0
+// CHECK6-NEXT: br i1 [[TMP43]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]]
+// CHECK6: .omp.final.then:
+// CHECK6-NEXT: [[TMP44:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK6-NEXT: [[SUB14:%.*]] = sub nsw i32 [[TMP44]], 0
+// CHECK6-NEXT: [[DIV15:%.*]] = sdiv i32 [[SUB14]], 1
+// CHECK6-NEXT: [[MUL:%.*]] = mul nsw i32 [[DIV15]], 1
+// CHECK6-NEXT: [[ADD16:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK6-NEXT: store i32 [[ADD16]], i32* [[I3]], align 4
+// CHECK6-NEXT: br label [[DOTOMP_FINAL_DONE]]
+// CHECK6: .omp.final.done:
+// CHECK6-NEXT: br label [[OMP_PRECOND_END]]
+// CHECK6: omp.precond.end:
+// CHECK6-NEXT: ret void
+//
+//
+// CHECK6-LABEL: define {{[^@]+}}@__omp_outlined__3
+// CHECK6-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] {
+// CHECK6-NEXT: entry:
+// CHECK6-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK6-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK6-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4
+// CHECK6-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4
+// CHECK6-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4
+// CHECK6-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 4
+// CHECK6-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK6-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK6-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK6-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
+// CHECK6-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK6-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK6-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK6-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK6-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK6-NEXT: [[I3:%.*]] = alloca i32, align 4
+// CHECK6-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK6-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK6-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK6-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK6-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4
+// CHECK6-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 4
+// CHECK6-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 4
+// CHECK6-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4
+// CHECK6-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK6-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK6-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0
+// CHECK6-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK6-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1
+// CHECK6-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK6-NEXT: store i32 0, i32* [[I]], align 4
+// CHECK6-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK6-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]]
+// CHECK6-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]]
+// CHECK6: omp.precond.then:
+// CHECK6-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK6-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK6-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_UB]], align 4
+// CHECK6-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK6-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK6-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_LB]], align 4
+// CHECK6-NEXT: store i32 [[TMP6]], i32* [[DOTOMP_UB]], align 4
+// CHECK6-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK6-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK6-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK6-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4
+// CHECK6-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP8]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK6-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK6-NEXT: store i32 [[TMP9]], i32* [[DOTOMP_IV]], align 4
+// CHECK6-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK6: omp.inner.for.cond:
+// CHECK6-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK6-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK6-NEXT: [[CMP4:%.*]] = icmp ule i32 [[TMP10]], [[TMP11]]
+// CHECK6-NEXT: br i1 [[CMP4]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK6: omp.inner.for.body:
+// CHECK6-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK6-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP12]], 1
+// CHECK6-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK6-NEXT: store i32 [[ADD]], i32* [[I3]], align 4
+// CHECK6-NEXT: [[TMP13:%.*]] = load i32, i32* [[I3]], align 4
+// CHECK6-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1000 x i16], [1000 x i16]* [[TMP0]], i32 0, i32 [[TMP13]]
+// CHECK6-NEXT: [[TMP14:%.*]] = load i16, i16* [[ARRAYIDX]], align 2
+// CHECK6-NEXT: [[CONV:%.*]] = sext i16 [[TMP14]] to i32
+// CHECK6-NEXT: [[ADD5:%.*]] = add nsw i32 [[CONV]], 1
+// CHECK6-NEXT: [[CONV6:%.*]] = trunc i32 [[ADD5]] to i16
+// CHECK6-NEXT: store i16 [[CONV6]], i16* [[ARRAYIDX]], align 2
+// CHECK6-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK6: omp.body.continue:
+// CHECK6-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK6: omp.inner.for.inc:
+// CHECK6-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK6-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK6-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP15]], [[TMP16]]
+// CHECK6-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_IV]], align 4
+// CHECK6-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP14:![0-9]+]]
+// CHECK6: omp.inner.for.end:
+// CHECK6-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK6: omp.loop.exit:
+// CHECK6-NEXT: [[TMP17:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK6-NEXT: [[TMP18:%.*]] = load i32, i32* [[TMP17]], align 4
+// CHECK6-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP18]])
+// CHECK6-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK6-NEXT: [[TMP20:%.*]] = icmp ne i32 [[TMP19]], 0
+// CHECK6-NEXT: br i1 [[TMP20]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]]
+// CHECK6: .omp.final.then:
+// CHECK6-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK6-NEXT: [[SUB8:%.*]] = sub nsw i32 [[TMP21]], 0
+// CHECK6-NEXT: [[DIV9:%.*]] = sdiv i32 [[SUB8]], 1
+// CHECK6-NEXT: [[MUL10:%.*]] = mul nsw i32 [[DIV9]], 1
+// CHECK6-NEXT: [[ADD11:%.*]] = add nsw i32 0, [[MUL10]]
+// CHECK6-NEXT: store i32 [[ADD11]], i32* [[I3]], align 4
+// CHECK6-NEXT: br label [[DOTOMP_FINAL_DONE]]
+// CHECK6: .omp.final.done:
+// CHECK6-NEXT: br label [[OMP_PRECOND_END]]
+// CHECK6: omp.precond.end:
+// CHECK6-NEXT: ret void
+//
+//
+// CHECK6-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l49
+// CHECK6-SAME: ([10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] {
+// CHECK6-NEXT: entry:
+// CHECK6-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4
+// CHECK6-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK6-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK6-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK6-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4
+// CHECK6-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4
+// CHECK6-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK6-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0)
+// CHECK6-NEXT: br label [[DOTEXECUTE:%.*]]
+// CHECK6: .execute:
+// CHECK6-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]])
+// CHECK6-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK6-NEXT: call void @__omp_outlined__4(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x i32]* [[TMP0]]) #[[ATTR2]]
+// CHECK6-NEXT: br label [[DOTOMP_DEINIT:%.*]]
+// CHECK6: .omp.deinit:
+// CHECK6-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0)
+// CHECK6-NEXT: br label [[DOTEXIT:%.*]]
+// CHECK6: .exit:
+// CHECK6-NEXT: ret void
+//
+//
+// CHECK6-LABEL: define {{[^@]+}}@__omp_outlined__4
+// CHECK6-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] {
+// CHECK6-NEXT: entry:
+// CHECK6-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK6-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK6-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4
+// CHECK6-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK6-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK6-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK6-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK6-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK6-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK6-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK6-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [3 x i8*], align 4
+// CHECK6-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK6-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK6-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4
+// CHECK6-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4
+// CHECK6-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK6-NEXT: store i32 9, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK6-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK6-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK6-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK6-NEXT: [[TMP1:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK6-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4
+// CHECK6-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]])
+// CHECK6-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK6-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP3]], 9
+// CHECK6-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK6: cond.true:
+// CHECK6-NEXT: br label [[COND_END:%.*]]
+// CHECK6: cond.false:
+// CHECK6-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK6-NEXT: br label [[COND_END]]
+// CHECK6: cond.end:
+// CHECK6-NEXT: [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP4]], [[COND_FALSE]] ]
+// CHECK6-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK6-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK6-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4
+// CHECK6-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK6: omp.inner.for.cond:
+// CHECK6-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK6-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP6]], 10
+// CHECK6-NEXT: br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK6: omp.inner.for.body:
+// CHECK6-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK6-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK6-NEXT: [[TMP9:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0
+// CHECK6-NEXT: [[TMP10:%.*]] = inttoptr i32 [[TMP7]] to i8*
+// CHECK6-NEXT: store i8* [[TMP10]], i8** [[TMP9]], align 4
+// CHECK6-NEXT: [[TMP11:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1
+// CHECK6-NEXT: [[TMP12:%.*]] = inttoptr i32 [[TMP8]] to i8*
+// CHECK6-NEXT: store i8* [[TMP12]], i8** [[TMP11]], align 4
+// CHECK6-NEXT: [[TMP13:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2
+// CHECK6-NEXT: [[TMP14:%.*]] = bitcast [10 x i32]* [[TMP0]] to i8*
+// CHECK6-NEXT: store i8* [[TMP14]], i8** [[TMP13]], align 4
+// CHECK6-NEXT: [[TMP15:%.*]] = bitcast [3 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK6-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, [10 x i32]*)* @__omp_outlined__5 to i8*), i8* null, i8** [[TMP15]], i32 3)
+// CHECK6-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK6: omp.inner.for.inc:
+// CHECK6-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK6-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK6-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP16]], [[TMP17]]
+// CHECK6-NEXT: store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4
+// CHECK6-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK6-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK6-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP18]], [[TMP19]]
+// CHECK6-NEXT: store i32 [[ADD2]], i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK6-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK6-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK6-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP20]], [[TMP21]]
+// CHECK6-NEXT: store i32 [[ADD3]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK6-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK6-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP22]], 9
+// CHECK6-NEXT: br i1 [[CMP4]], label [[COND_TRUE5:%.*]], label [[COND_FALSE6:%.*]]
+// CHECK6: cond.true5:
+// CHECK6-NEXT: br label [[COND_END7:%.*]]
+// CHECK6: cond.false6:
+// CHECK6-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK6-NEXT: br label [[COND_END7]]
+// CHECK6: cond.end7:
+// CHECK6-NEXT: [[COND8:%.*]] = phi i32 [ 9, [[COND_TRUE5]] ], [ [[TMP23]], [[COND_FALSE6]] ]
+// CHECK6-NEXT: store i32 [[COND8]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK6-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK6-NEXT: store i32 [[TMP24]], i32* [[DOTOMP_IV]], align 4
+// CHECK6-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP15:![0-9]+]]
+// CHECK6: omp.inner.for.end:
+// CHECK6-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK6: omp.loop.exit:
+// CHECK6-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]])
+// CHECK6-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK6-NEXT: [[TMP26:%.*]] = icmp ne i32 [[TMP25]], 0
+// CHECK6-NEXT: br i1 [[TMP26]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]]
+// CHECK6: .omp.final.then:
+// CHECK6-NEXT: store i32 10, i32* [[I]], align 4
+// CHECK6-NEXT: br label [[DOTOMP_FINAL_DONE]]
+// CHECK6: .omp.final.done:
+// CHECK6-NEXT: ret void
+//
+//
+// CHECK6-LABEL: define {{[^@]+}}@__omp_outlined__5
+// CHECK6-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] {
+// CHECK6-NEXT: entry:
+// CHECK6-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK6-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK6-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4
+// CHECK6-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4
+// CHECK6-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4
+// CHECK6-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK6-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK6-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK6-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK6-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK6-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK6-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK6-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK6-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK6-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK6-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK6-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4
+// CHECK6-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4
+// CHECK6-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK6-NEXT: store i32 9, i32* [[DOTOMP_UB]], align 4
+// CHECK6-NEXT: [[TMP1:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK6-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK6-NEXT: store i32 [[TMP1]], i32* [[DOTOMP_LB]], align 4
+// CHECK6-NEXT: store i32 [[TMP2]], i32* [[DOTOMP_UB]], align 4
+// CHECK6-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK6-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK6-NEXT: [[TMP3:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK6-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4
+// CHECK6-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP4]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK6-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK6-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4
+// CHECK6-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK6: omp.inner.for.cond:
+// CHECK6-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK6-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK6-NEXT: [[CMP:%.*]] = icmp ule i32 [[TMP6]], [[TMP7]]
+// CHECK6-NEXT: br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK6: omp.inner.for.body:
+// CHECK6-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK6-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP8]], 1
+// CHECK6-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK6-NEXT: store i32 [[ADD]], i32* [[I]], align 4
+// CHECK6-NEXT: [[TMP9:%.*]] = load i32, i32* [[I]], align 4
+// CHECK6-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP0]], i32 0, i32 [[TMP9]]
+// CHECK6-NEXT: [[TMP10:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+// CHECK6-NEXT: [[ADD1:%.*]] = add nsw i32 [[TMP10]], 1
+// CHECK6-NEXT: store i32 [[ADD1]], i32* [[ARRAYIDX]], align 4
+// CHECK6-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK6: omp.body.continue:
+// CHECK6-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK6: omp.inner.for.inc:
+// CHECK6-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK6-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK6-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP11]], [[TMP12]]
+// CHECK6-NEXT: store i32 [[ADD2]], i32* [[DOTOMP_IV]], align 4
+// CHECK6-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP16:![0-9]+]]
+// CHECK6: omp.inner.for.end:
+// CHECK6-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK6: omp.loop.exit:
+// CHECK6-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP4]])
+// CHECK6-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK6-NEXT: [[TMP14:%.*]] = icmp ne i32 [[TMP13]], 0
+// CHECK6-NEXT: br i1 [[TMP14]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]]
+// CHECK6: .omp.final.then:
+// CHECK6-NEXT: store i32 10, i32* [[I]], align 4
+// CHECK6-NEXT: br label [[DOTOMP_FINAL_DONE]]
+// CHECK6: .omp.final.done:
+// CHECK6-NEXT: ret void
+//
+//
+// CHECK6-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l54
+// CHECK6-SAME: ([10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i32 [[F:%.*]]) #[[ATTR0]] {
+// CHECK6-NEXT: entry:
+// CHECK6-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4
+// CHECK6-NEXT: [[F_ADDR:%.*]] = alloca i32, align 4
+// CHECK6-NEXT: [[F_CASTED:%.*]] = alloca i32, align 4
+// CHECK6-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK6-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK6-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK6-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4
+// CHECK6-NEXT: store i32 [[F]], i32* [[F_ADDR]], align 4
+// CHECK6-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4
+// CHECK6-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK6-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0)
+// CHECK6-NEXT: br label [[DOTEXECUTE:%.*]]
+// CHECK6: .execute:
+// CHECK6-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]])
+// CHECK6-NEXT: [[TMP2:%.*]] = load i32, i32* [[F_ADDR]], align 4
+// CHECK6-NEXT: store i32 [[TMP2]], i32* [[F_CASTED]], align 4
+// CHECK6-NEXT: [[TMP3:%.*]] = load i32, i32* [[F_CASTED]], align 4
+// CHECK6-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK6-NEXT: call void @__omp_outlined__6(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x [10 x i32]]* [[TMP0]], i32 [[TMP3]]) #[[ATTR2]]
+// CHECK6-NEXT: br label [[DOTOMP_DEINIT:%.*]]
+// CHECK6: .omp.deinit:
+// CHECK6-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0)
+// CHECK6-NEXT: br label [[DOTEXIT:%.*]]
+// CHECK6: .exit:
+// CHECK6-NEXT: ret void
+//
+//
+// CHECK6-LABEL: define {{[^@]+}}@__omp_outlined__6
+// CHECK6-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i32 [[F:%.*]]) #[[ATTR0]] {
+// CHECK6-NEXT: entry:
+// CHECK6-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK6-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK6-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4
+// CHECK6-NEXT: [[F_ADDR:%.*]] = alloca i32, align 4
+// CHECK6-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK6-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK6-NEXT: [[_TMP1:%.*]] = alloca i32, align 4
+// CHECK6-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK6-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK6-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK6-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK6-NEXT: [[K:%.*]] = alloca i32, align 4
+// CHECK6-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK6-NEXT: [[J:%.*]] = alloca i32, align 4
+// CHECK6-NEXT: [[F_CASTED:%.*]] = alloca i32, align 4
+// CHECK6-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x i8*], align 4
+// CHECK6-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK6-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK6-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4
+// CHECK6-NEXT: store i32 [[F]], i32* [[F_ADDR]], align 4
+// CHECK6-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4
+// CHECK6-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK6-NEXT: store i32 99, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK6-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK6-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK6-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK6-NEXT: [[TMP1:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK6-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4
+// CHECK6-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]])
+// CHECK6-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK6-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP3]], 99
+// CHECK6-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK6: cond.true:
+// CHECK6-NEXT: br label [[COND_END:%.*]]
+// CHECK6: cond.false:
+// CHECK6-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK6-NEXT: br label [[COND_END]]
+// CHECK6: cond.end:
+// CHECK6-NEXT: [[COND:%.*]] = phi i32 [ 99, [[COND_TRUE]] ], [ [[TMP4]], [[COND_FALSE]] ]
+// CHECK6-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK6-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK6-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4
+// CHECK6-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK6: omp.inner.for.cond:
+// CHECK6-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK6-NEXT: [[CMP2:%.*]] = icmp slt i32 [[TMP6]], 100
+// CHECK6-NEXT: br i1 [[CMP2]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK6: omp.inner.for.body:
+// CHECK6-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK6-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK6-NEXT: [[TMP9:%.*]] = load i32, i32* [[F_ADDR]], align 4
+// CHECK6-NEXT: store i32 [[TMP9]], i32* [[F_CASTED]], align 4
+// CHECK6-NEXT: [[TMP10:%.*]] = load i32, i32* [[F_CASTED]], align 4
+// CHECK6-NEXT: [[TMP11:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0
+// CHECK6-NEXT: [[TMP12:%.*]] = inttoptr i32 [[TMP7]] to i8*
+// CHECK6-NEXT: store i8* [[TMP12]], i8** [[TMP11]], align 4
+// CHECK6-NEXT: [[TMP13:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1
+// CHECK6-NEXT: [[TMP14:%.*]] = inttoptr i32 [[TMP8]] to i8*
+// CHECK6-NEXT: store i8* [[TMP14]], i8** [[TMP13]], align 4
+// CHECK6-NEXT: [[TMP15:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2
+// CHECK6-NEXT: [[TMP16:%.*]] = bitcast [10 x [10 x i32]]* [[TMP0]] to i8*
+// CHECK6-NEXT: store i8* [[TMP16]], i8** [[TMP15]], align 4
+// CHECK6-NEXT: [[TMP17:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3
+// CHECK6-NEXT: [[TMP18:%.*]] = inttoptr i32 [[TMP10]] to i8*
+// CHECK6-NEXT: store i8* [[TMP18]], i8** [[TMP17]], align 4
+// CHECK6-NEXT: [[TMP19:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK6-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, [10 x [10 x i32]]*, i32)* @__omp_outlined__7 to i8*), i8* null, i8** [[TMP19]], i32 4)
+// CHECK6-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK6: omp.inner.for.inc:
+// CHECK6-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK6-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK6-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP20]], [[TMP21]]
+// CHECK6-NEXT: store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4
+// CHECK6-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK6-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK6-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP22]], [[TMP23]]
+// CHECK6-NEXT: store i32 [[ADD3]], i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK6-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK6-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK6-NEXT: [[ADD4:%.*]] = add nsw i32 [[TMP24]], [[TMP25]]
+// CHECK6-NEXT: store i32 [[ADD4]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK6-NEXT: [[TMP26:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK6-NEXT: [[CMP5:%.*]] = icmp sgt i32 [[TMP26]], 99
+// CHECK6-NEXT: br i1 [[CMP5]], label [[COND_TRUE6:%.*]], label [[COND_FALSE7:%.*]]
+// CHECK6: cond.true6:
+// CHECK6-NEXT: br label [[COND_END8:%.*]]
+// CHECK6: cond.false7:
+// CHECK6-NEXT: [[TMP27:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK6-NEXT: br label [[COND_END8]]
+// CHECK6: cond.end8:
+// CHECK6-NEXT: [[COND9:%.*]] = phi i32 [ 99, [[COND_TRUE6]] ], [ [[TMP27]], [[COND_FALSE7]] ]
+// CHECK6-NEXT: store i32 [[COND9]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK6-NEXT: [[TMP28:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK6-NEXT: store i32 [[TMP28]], i32* [[DOTOMP_IV]], align 4
+// CHECK6-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP17:![0-9]+]]
+// CHECK6: omp.inner.for.end:
+// CHECK6-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK6: omp.loop.exit:
+// CHECK6-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]])
+// CHECK6-NEXT: [[TMP29:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK6-NEXT: [[TMP30:%.*]] = icmp ne i32 [[TMP29]], 0
+// CHECK6-NEXT: br i1 [[TMP30]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]]
+// CHECK6: .omp.final.then:
+// CHECK6-NEXT: store i32 10, i32* [[I]], align 4
+// CHECK6-NEXT: store i32 10, i32* [[J]], align 4
+// CHECK6-NEXT: br label [[DOTOMP_FINAL_DONE]]
+// CHECK6: .omp.final.done:
+// CHECK6-NEXT: ret void
+//
+//
+// CHECK6-LABEL: define {{[^@]+}}@__omp_outlined__7
+// CHECK6-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i32 [[F:%.*]]) #[[ATTR0]] {
+// CHECK6-NEXT: entry:
+// CHECK6-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK6-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK6-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4
+// CHECK6-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4
+// CHECK6-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4
+// CHECK6-NEXT: [[F_ADDR:%.*]] = alloca i32, align 4
+// CHECK6-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK6-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK6-NEXT: [[_TMP1:%.*]] = alloca i32, align 4
+// CHECK6-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK6-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK6-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK6-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK6-NEXT: [[K:%.*]] = alloca i32, align 4
+// CHECK6-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK6-NEXT: [[J:%.*]] = alloca i32, align 4
+// CHECK6-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK6-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK6-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK6-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK6-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4
+// CHECK6-NEXT: store i32 [[F]], i32* [[F_ADDR]], align 4
+// CHECK6-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4
+// CHECK6-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK6-NEXT: store i32 99, i32* [[DOTOMP_UB]], align 4
+// CHECK6-NEXT: [[TMP1:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK6-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK6-NEXT: store i32 [[TMP1]], i32* [[DOTOMP_LB]], align 4
+// CHECK6-NEXT: store i32 [[TMP2]], i32* [[DOTOMP_UB]], align 4
+// CHECK6-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK6-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK6-NEXT: [[TMP3:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK6-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4
+// CHECK6-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP4]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK6-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK6-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4
+// CHECK6-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK6: omp.inner.for.cond:
+// CHECK6-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK6-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK6-NEXT: [[CMP:%.*]] = icmp ule i32 [[TMP6]], [[TMP7]]
+// CHECK6-NEXT: br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK6: omp.inner.for.body:
+// CHECK6-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK6-NEXT: [[DIV:%.*]] = sdiv i32 [[TMP8]], 10
+// CHECK6-NEXT: [[MUL:%.*]] = mul nsw i32 [[DIV]], 1
+// CHECK6-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK6-NEXT: store i32 [[ADD]], i32* [[I]], align 4
+// CHECK6-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK6-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK6-NEXT: [[DIV2:%.*]] = sdiv i32 [[TMP10]], 10
+// CHECK6-NEXT: [[MUL3:%.*]] = mul nsw i32 [[DIV2]], 10
+// CHECK6-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP9]], [[MUL3]]
+// CHECK6-NEXT: [[MUL4:%.*]] = mul nsw i32 [[SUB]], 1
+// CHECK6-NEXT: [[ADD5:%.*]] = add nsw i32 0, [[MUL4]]
+// CHECK6-NEXT: store i32 [[ADD5]], i32* [[J]], align 4
+// CHECK6-NEXT: store i32 10, i32* [[K]], align 4
+// CHECK6-NEXT: [[TMP11:%.*]] = load i32, i32* [[I]], align 4
+// CHECK6-NEXT: [[TMP12:%.*]] = load i32, i32* [[J]], align 4
+// CHECK6-NEXT: [[TMP13:%.*]] = load i32, i32* [[F_ADDR]], align 4
+// CHECK6-NEXT: [[MUL6:%.*]] = mul nsw i32 [[TMP12]], [[TMP13]]
+// CHECK6-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP11]], [[MUL6]]
+// CHECK6-NEXT: [[TMP14:%.*]] = load i32, i32* [[K]], align 4
+// CHECK6-NEXT: [[ADD8:%.*]] = add nsw i32 [[ADD7]], [[TMP14]]
+// CHECK6-NEXT: [[TMP15:%.*]] = load i32, i32* [[I]], align 4
+// CHECK6-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* [[TMP0]], i32 0, i32 [[TMP15]]
+// CHECK6-NEXT: [[TMP16:%.*]] = load i32, i32* [[J]], align 4
+// CHECK6-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[ARRAYIDX]], i32 0, i32 [[TMP16]]
+// CHECK6-NEXT: store i32 [[ADD8]], i32* [[ARRAYIDX9]], align 4
+// CHECK6-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK6: omp.body.continue:
+// CHECK6-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK6: omp.inner.for.inc:
+// CHECK6-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK6-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK6-NEXT: [[ADD10:%.*]] = add nsw i32 [[TMP17]], [[TMP18]]
+// CHECK6-NEXT: store i32 [[ADD10]], i32* [[DOTOMP_IV]], align 4
+// CHECK6-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP18:![0-9]+]]
+// CHECK6: omp.inner.for.end:
+// CHECK6-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK6: omp.loop.exit:
+// CHECK6-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP4]])
+// CHECK6-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK6-NEXT: [[TMP20:%.*]] = icmp ne i32 [[TMP19]], 0
+// CHECK6-NEXT: br i1 [[TMP20]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]]
+// CHECK6: .omp.final.then:
+// CHECK6-NEXT: store i32 10, i32* [[I]], align 4
+// CHECK6-NEXT: store i32 10, i32* [[J]], align 4
+// CHECK6-NEXT: br label [[DOTOMP_FINAL_DONE]]
+// CHECK6: .omp.final.done:
+// CHECK6-NEXT: ret void
+//
+//
+// CHECK7-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l38
+// CHECK7-SAME: (i64 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i64 [[L:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK7-NEXT: entry:
+// CHECK7-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8
+// CHECK7-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 8
+// CHECK7-NEXT: [[L_ADDR:%.*]] = alloca i64, align 8
+// CHECK7-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8
+// CHECK7-NEXT: [[L_CASTED:%.*]] = alloca i64, align 8
+// CHECK7-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK7-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK7-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK7-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8
+// CHECK7-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 8
+// CHECK7-NEXT: store i64 [[L]], i64* [[L_ADDR]], align 8
+// CHECK7-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32*
+// CHECK7-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 8
+// CHECK7-NEXT: [[CONV1:%.*]] = bitcast i64* [[L_ADDR]] to i32*
+// CHECK7-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK7-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0)
+// CHECK7-NEXT: br label [[DOTEXECUTE:%.*]]
+// CHECK7: .execute:
+// CHECK7-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3:[0-9]+]])
+// CHECK7-NEXT: [[TMP2:%.*]] = load i32, i32* [[CONV]], align 8
+// CHECK7-NEXT: [[CONV2:%.*]] = bitcast i64* [[N_CASTED]] to i32*
+// CHECK7-NEXT: store i32 [[TMP2]], i32* [[CONV2]], align 4
+// CHECK7-NEXT: [[TMP3:%.*]] = load i64, i64* [[N_CASTED]], align 8
+// CHECK7-NEXT: [[TMP4:%.*]] = load i32, i32* [[CONV1]], align 8
+// CHECK7-NEXT: [[CONV3:%.*]] = bitcast i64* [[L_CASTED]] to i32*
+// CHECK7-NEXT: store i32 [[TMP4]], i32* [[CONV3]], align 4
+// CHECK7-NEXT: [[TMP5:%.*]] = load i64, i64* [[L_CASTED]], align 8
+// CHECK7-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK7-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i64 [[TMP3]], [1000 x i32]* [[TMP0]], i64 [[TMP5]]) #[[ATTR2:[0-9]+]]
+// CHECK7-NEXT: br label [[DOTOMP_DEINIT:%.*]]
+// CHECK7: .omp.deinit:
+// CHECK7-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0)
+// CHECK7-NEXT: br label [[DOTEXIT:%.*]]
+// CHECK7: .exit:
+// CHECK7-NEXT: ret void
+//
+//
+// CHECK7-LABEL: define {{[^@]+}}@__omp_outlined__
+// CHECK7-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i64 [[L:%.*]]) #[[ATTR0]] {
+// CHECK7-NEXT: entry:
+// CHECK7-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK7-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK7-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8
+// CHECK7-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 8
+// CHECK7-NEXT: [[L_ADDR:%.*]] = alloca i64, align 8
+// CHECK7-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK7-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK7-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK7-NEXT: [[DOTCAPTURE_EXPR_3:%.*]] = alloca i32, align 4
+// CHECK7-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK7-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK7-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK7-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK7-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK7-NEXT: [[I5:%.*]] = alloca i32, align 4
+// CHECK7-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8
+// CHECK7-NEXT: [[L_CASTED:%.*]] = alloca i64, align 8
+// CHECK7-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [5 x i8*], align 8
+// CHECK7-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK7-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
+// CHECK7-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8
+// CHECK7-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 8
+// CHECK7-NEXT: store i64 [[L]], i64* [[L_ADDR]], align 8
+// CHECK7-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32*
+// CHECK7-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 8
+// CHECK7-NEXT: [[CONV1:%.*]] = bitcast i64* [[L_ADDR]] to i32*
+// CHECK7-NEXT: [[TMP1:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared", align 2
+// CHECK7-NEXT: [[TMP2:%.*]] = load i64, i64* @"_openmp_static_kernel$size", align 8
+// CHECK7-NEXT: call void @__kmpc_get_team_static_memory(i16 1, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds (%"union._shared_openmp_static_memory_type_$_", %"union._shared_openmp_static_memory_type_$_" addrspace(3)* @"_openmp_shared_static_glob_rd_$_", i32 0, i32 0, i32 0) to i8*), i64 [[TMP2]], i16 [[TMP1]], i8** addrspacecast (i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr" to i8**))
+// CHECK7-NEXT: [[TMP3:%.*]] = load i8*, i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr", align 8
+// CHECK7-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, i8* [[TMP3]], i64 0
+// CHECK7-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to %struct._globalized_locals_ty*
+// CHECK7-NEXT: [[L2:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP5]], i32 0, i32 0
+// CHECK7-NEXT: [[TMP6:%.*]] = load i32, i32* [[CONV]], align 8
+// CHECK7-NEXT: store i32 [[TMP6]], i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK7-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK7-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP7]], 0
+// CHECK7-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK7-NEXT: [[SUB4:%.*]] = sub nsw i32 [[DIV]], 1
+// CHECK7-NEXT: store i32 [[SUB4]], i32* [[DOTCAPTURE_EXPR_3]], align 4
+// CHECK7-NEXT: store i32 0, i32* [[I]], align 4
+// CHECK7-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK7-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP8]]
+// CHECK7-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]]
+// CHECK7: omp.precond.then:
+// CHECK7-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK7-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4
+// CHECK7-NEXT: store i32 [[TMP9]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK7-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK7-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK7-NEXT: [[TMP10:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK7-NEXT: [[TMP11:%.*]] = load i32, i32* [[TMP10]], align 4
+// CHECK7-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1:[0-9]+]], i32 [[TMP11]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 128)
+// CHECK7-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK7-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4
+// CHECK7-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[TMP12]], [[TMP13]]
+// CHECK7-NEXT: br i1 [[CMP6]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK7: cond.true:
+// CHECK7-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4
+// CHECK7-NEXT: br label [[COND_END:%.*]]
+// CHECK7: cond.false:
+// CHECK7-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK7-NEXT: br label [[COND_END]]
+// CHECK7: cond.end:
+// CHECK7-NEXT: [[COND:%.*]] = phi i32 [ [[TMP14]], [[COND_TRUE]] ], [ [[TMP15]], [[COND_FALSE]] ]
+// CHECK7-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK7-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK7-NEXT: store i32 [[TMP16]], i32* [[DOTOMP_IV]], align 4
+// CHECK7-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK7: omp.inner.for.cond:
+// CHECK7-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK7-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4
+// CHECK7-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP18]], 1
+// CHECK7-NEXT: [[CMP7:%.*]] = icmp slt i32 [[TMP17]], [[ADD]]
+// CHECK7-NEXT: br i1 [[CMP7]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK7: omp.inner.for.body:
+// CHECK7-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK7-NEXT: [[TMP20:%.*]] = zext i32 [[TMP19]] to i64
+// CHECK7-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK7-NEXT: [[TMP22:%.*]] = zext i32 [[TMP21]] to i64
+// CHECK7-NEXT: [[TMP23:%.*]] = load i32, i32* [[CONV]], align 8
+// CHECK7-NEXT: [[CONV8:%.*]] = bitcast i64* [[N_CASTED]] to i32*
+// CHECK7-NEXT: store i32 [[TMP23]], i32* [[CONV8]], align 4
+// CHECK7-NEXT: [[TMP24:%.*]] = load i64, i64* [[N_CASTED]], align 8
+// CHECK7-NEXT: [[TMP25:%.*]] = load i32, i32* [[CONV1]], align 8
+// CHECK7-NEXT: [[CONV9:%.*]] = bitcast i64* [[L_CASTED]] to i32*
+// CHECK7-NEXT: store i32 [[TMP25]], i32* [[CONV9]], align 4
+// CHECK7-NEXT: [[TMP26:%.*]] = load i64, i64* [[L_CASTED]], align 8
+// CHECK7-NEXT: [[TMP27:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0
+// CHECK7-NEXT: [[TMP28:%.*]] = inttoptr i64 [[TMP20]] to i8*
+// CHECK7-NEXT: store i8* [[TMP28]], i8** [[TMP27]], align 8
+// CHECK7-NEXT: [[TMP29:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1
+// CHECK7-NEXT: [[TMP30:%.*]] = inttoptr i64 [[TMP22]] to i8*
+// CHECK7-NEXT: store i8* [[TMP30]], i8** [[TMP29]], align 8
+// CHECK7-NEXT: [[TMP31:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 2
+// CHECK7-NEXT: [[TMP32:%.*]] = inttoptr i64 [[TMP24]] to i8*
+// CHECK7-NEXT: store i8* [[TMP32]], i8** [[TMP31]], align 8
+// CHECK7-NEXT: [[TMP33:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 3
+// CHECK7-NEXT: [[TMP34:%.*]] = bitcast [1000 x i32]* [[TMP0]] to i8*
+// CHECK7-NEXT: store i8* [[TMP34]], i8** [[TMP33]], align 8
+// CHECK7-NEXT: [[TMP35:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 4
+// CHECK7-NEXT: [[TMP36:%.*]] = inttoptr i64 [[TMP26]] to i8*
+// CHECK7-NEXT: store i8* [[TMP36]], i8** [[TMP35]], align 8
+// CHECK7-NEXT: [[TMP37:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK7-NEXT: [[TMP38:%.*]] = load i32, i32* [[TMP37]], align 4
+// CHECK7-NEXT: [[TMP39:%.*]] = bitcast [5 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK7-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP38]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i64, i64, i64, [1000 x i32]*, i64)* @__omp_outlined__1 to i8*), i8* null, i8** [[TMP39]], i64 5)
+// CHECK7-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK7: omp.inner.for.inc:
+// CHECK7-NEXT: [[TMP40:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK7-NEXT: [[TMP41:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK7-NEXT: [[ADD10:%.*]] = add nsw i32 [[TMP40]], [[TMP41]]
+// CHECK7-NEXT: store i32 [[ADD10]], i32* [[DOTOMP_IV]], align 4
+// CHECK7-NEXT: [[TMP42:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK7-NEXT: [[TMP43:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK7-NEXT: [[ADD11:%.*]] = add nsw i32 [[TMP42]], [[TMP43]]
+// CHECK7-NEXT: store i32 [[ADD11]], i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK7-NEXT: [[TMP44:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK7-NEXT: [[TMP45:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK7-NEXT: [[ADD12:%.*]] = add nsw i32 [[TMP44]], [[TMP45]]
+// CHECK7-NEXT: store i32 [[ADD12]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK7-NEXT: [[TMP46:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK7-NEXT: [[TMP47:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4
+// CHECK7-NEXT: [[CMP13:%.*]] = icmp sgt i32 [[TMP46]], [[TMP47]]
+// CHECK7-NEXT: br i1 [[CMP13]], label [[COND_TRUE14:%.*]], label [[COND_FALSE15:%.*]]
+// CHECK7: cond.true14:
+// CHECK7-NEXT: [[TMP48:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4
+// CHECK7-NEXT: br label [[COND_END16:%.*]]
+// CHECK7: cond.false15:
+// CHECK7-NEXT: [[TMP49:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK7-NEXT: br label [[COND_END16]]
+// CHECK7: cond.end16:
+// CHECK7-NEXT: [[COND17:%.*]] = phi i32 [ [[TMP48]], [[COND_TRUE14]] ], [ [[TMP49]], [[COND_FALSE15]] ]
+// CHECK7-NEXT: store i32 [[COND17]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK7-NEXT: [[TMP50:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK7-NEXT: store i32 [[TMP50]], i32* [[DOTOMP_IV]], align 4
+// CHECK7-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP10:![0-9]+]]
+// CHECK7: omp.inner.for.end:
+// CHECK7-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK7: omp.loop.exit:
+// CHECK7-NEXT: [[TMP51:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK7-NEXT: [[TMP52:%.*]] = load i32, i32* [[TMP51]], align 4
+// CHECK7-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP52]])
+// CHECK7-NEXT: [[TMP53:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK7-NEXT: [[TMP54:%.*]] = icmp ne i32 [[TMP53]], 0
+// CHECK7-NEXT: br i1 [[TMP54]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]]
+// CHECK7: .omp.final.then:
+// CHECK7-NEXT: [[TMP55:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK7-NEXT: [[SUB18:%.*]] = sub nsw i32 [[TMP55]], 0
+// CHECK7-NEXT: [[DIV19:%.*]] = sdiv i32 [[SUB18]], 1
+// CHECK7-NEXT: [[MUL:%.*]] = mul nsw i32 [[DIV19]], 1
+// CHECK7-NEXT: [[ADD20:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK7-NEXT: store i32 [[ADD20]], i32* [[I5]], align 4
+// CHECK7-NEXT: br label [[DOTOMP_FINAL_DONE]]
+// CHECK7: .omp.final.done:
+// CHECK7-NEXT: [[TMP56:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK7-NEXT: [[TMP57:%.*]] = icmp ne i32 [[TMP56]], 0
+// CHECK7-NEXT: br i1 [[TMP57]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]]
+// CHECK7: .omp.lastprivate.then:
+// CHECK7-NEXT: [[TMP58:%.*]] = load i32, i32* [[CONV1]], align 8
+// CHECK7-NEXT: store i32 [[TMP58]], i32* [[CONV1]], align 8
+// CHECK7-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]]
+// CHECK7: .omp.lastprivate.done:
+// CHECK7-NEXT: br label [[OMP_PRECOND_END]]
+// CHECK7: omp.precond.end:
+// CHECK7-NEXT: [[TMP59:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared", align 2
+// CHECK7-NEXT: call void @__kmpc_restore_team_static_memory(i16 1, i16 [[TMP59]])
+// CHECK7-NEXT: ret void
+//
+//
+// CHECK7-LABEL: define {{[^@]+}}@__omp_outlined__1
+// CHECK7-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], i64 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i64 [[L:%.*]]) #[[ATTR0]] {
+// CHECK7-NEXT: entry:
+// CHECK7-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK7-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK7-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8
+// CHECK7-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8
+// CHECK7-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8
+// CHECK7-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 8
+// CHECK7-NEXT: [[L_ADDR:%.*]] = alloca i64, align 8
+// CHECK7-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK7-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK7-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK7-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4
+// CHECK7-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK7-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK7-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK7-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK7-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK7-NEXT: [[I6:%.*]] = alloca i32, align 4
+// CHECK7-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK7-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
+// CHECK7-NEXT: store i64 [[DOTPREVIOUS_LB_]], i64* [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK7-NEXT: store i64 [[DOTPREVIOUS_UB_]], i64* [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK7-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8
+// CHECK7-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 8
+// CHECK7-NEXT: store i64 [[L]], i64* [[L_ADDR]], align 8
+// CHECK7-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32*
+// CHECK7-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 8
+// CHECK7-NEXT: [[CONV1:%.*]] = bitcast i64* [[L_ADDR]] to i32*
+// CHECK7-NEXT: [[TMP1:%.*]] = load i32, i32* [[CONV]], align 8
+// CHECK7-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK7-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK7-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0
+// CHECK7-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK7-NEXT: [[SUB3:%.*]] = sub nsw i32 [[DIV]], 1
+// CHECK7-NEXT: store i32 [[SUB3]], i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK7-NEXT: store i32 0, i32* [[I]], align 4
+// CHECK7-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK7-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]]
+// CHECK7-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]]
+// CHECK7: omp.precond.then:
+// CHECK7-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK7-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK7-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_UB]], align 4
+// CHECK7-NEXT: [[TMP5:%.*]] = load i64, i64* [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK7-NEXT: [[CONV4:%.*]] = trunc i64 [[TMP5]] to i32
+// CHECK7-NEXT: [[TMP6:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK7-NEXT: [[CONV5:%.*]] = trunc i64 [[TMP6]] to i32
+// CHECK7-NEXT: store i32 [[CONV4]], i32* [[DOTOMP_LB]], align 4
+// CHECK7-NEXT: store i32 [[CONV5]], i32* [[DOTOMP_UB]], align 4
+// CHECK7-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK7-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK7-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK7-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4
+// CHECK7-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP8]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 32)
+// CHECK7-NEXT: br label [[OMP_DISPATCH_COND:%.*]]
+// CHECK7: omp.dispatch.cond:
+// CHECK7-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK7-NEXT: [[CONV7:%.*]] = sext i32 [[TMP9]] to i64
+// CHECK7-NEXT: [[TMP10:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK7-NEXT: [[CMP8:%.*]] = icmp ugt i64 [[CONV7]], [[TMP10]]
+// CHECK7-NEXT: br i1 [[CMP8]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK7: cond.true:
+// CHECK7-NEXT: [[TMP11:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK7-NEXT: br label [[COND_END:%.*]]
+// CHECK7: cond.false:
+// CHECK7-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK7-NEXT: [[CONV9:%.*]] = sext i32 [[TMP12]] to i64
+// CHECK7-NEXT: br label [[COND_END]]
+// CHECK7: cond.end:
+// CHECK7-NEXT: [[COND:%.*]] = phi i64 [ [[TMP11]], [[COND_TRUE]] ], [ [[CONV9]], [[COND_FALSE]] ]
+// CHECK7-NEXT: [[CONV10:%.*]] = trunc i64 [[COND]] to i32
+// CHECK7-NEXT: store i32 [[CONV10]], i32* [[DOTOMP_UB]], align 4
+// CHECK7-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK7-NEXT: store i32 [[TMP13]], i32* [[DOTOMP_IV]], align 4
+// CHECK7-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK7-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK7-NEXT: [[CMP11:%.*]] = icmp sle i32 [[TMP14]], [[TMP15]]
+// CHECK7-NEXT: br i1 [[CMP11]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]]
+// CHECK7: omp.dispatch.body:
+// CHECK7-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK7: omp.inner.for.cond:
+// CHECK7-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK7-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK7-NEXT: [[CMP12:%.*]] = icmp sle i32 [[TMP16]], [[TMP17]]
+// CHECK7-NEXT: br i1 [[CMP12]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK7: omp.inner.for.body:
+// CHECK7-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK7-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP18]], 1
+// CHECK7-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK7-NEXT: store i32 [[ADD]], i32* [[I6]], align 4
+// CHECK7-NEXT: [[TMP19:%.*]] = load i32, i32* [[I6]], align 4
+// CHECK7-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP19]] to i64
+// CHECK7-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1000 x i32], [1000 x i32]* [[TMP0]], i64 0, i64 [[IDXPROM]]
+// CHECK7-NEXT: store i32 1, i32* [[ARRAYIDX]], align 4
+// CHECK7-NEXT: [[TMP20:%.*]] = load i32, i32* [[I6]], align 4
+// CHECK7-NEXT: store i32 [[TMP20]], i32* [[CONV1]], align 8
+// CHECK7-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK7: omp.body.continue:
+// CHECK7-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK7: omp.inner.for.inc:
+// CHECK7-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK7-NEXT: [[ADD13:%.*]] = add nsw i32 [[TMP21]], 1
+// CHECK7-NEXT: store i32 [[ADD13]], i32* [[DOTOMP_IV]], align 4
+// CHECK7-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP12:![0-9]+]]
+// CHECK7: omp.inner.for.end:
+// CHECK7-NEXT: br label [[OMP_DISPATCH_INC:%.*]]
+// CHECK7: omp.dispatch.inc:
+// CHECK7-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK7-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK7-NEXT: [[ADD14:%.*]] = add nsw i32 [[TMP22]], [[TMP23]]
+// CHECK7-NEXT: store i32 [[ADD14]], i32* [[DOTOMP_LB]], align 4
+// CHECK7-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK7-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK7-NEXT: [[ADD15:%.*]] = add nsw i32 [[TMP24]], [[TMP25]]
+// CHECK7-NEXT: store i32 [[ADD15]], i32* [[DOTOMP_UB]], align 4
+// CHECK7-NEXT: br label [[OMP_DISPATCH_COND]]
+// CHECK7: omp.dispatch.end:
+// CHECK7-NEXT: [[TMP26:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK7-NEXT: [[TMP27:%.*]] = load i32, i32* [[TMP26]], align 4
+// CHECK7-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP27]])
+// CHECK7-NEXT: [[TMP28:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK7-NEXT: [[TMP29:%.*]] = icmp ne i32 [[TMP28]], 0
+// CHECK7-NEXT: br i1 [[TMP29]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]]
+// CHECK7: .omp.final.then:
+// CHECK7-NEXT: [[TMP30:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK7-NEXT: [[SUB16:%.*]] = sub nsw i32 [[TMP30]], 0
+// CHECK7-NEXT: [[DIV17:%.*]] = sdiv i32 [[SUB16]], 1
+// CHECK7-NEXT: [[MUL18:%.*]] = mul nsw i32 [[DIV17]], 1
+// CHECK7-NEXT: [[ADD19:%.*]] = add nsw i32 0, [[MUL18]]
+// CHECK7-NEXT: store i32 [[ADD19]], i32* [[I6]], align 4
+// CHECK7-NEXT: br label [[DOTOMP_FINAL_DONE]]
+// CHECK7: .omp.final.done:
+// CHECK7-NEXT: [[TMP31:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK7-NEXT: [[TMP32:%.*]] = icmp ne i32 [[TMP31]], 0
+// CHECK7-NEXT: br i1 [[TMP32]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]]
+// CHECK7: .omp.lastprivate.then:
+// CHECK7-NEXT: [[TMP33:%.*]] = load i32, i32* [[CONV1]], align 8
+// CHECK7-NEXT: store i32 [[TMP33]], i32* [[CONV1]], align 8
+// CHECK7-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]]
+// CHECK7: .omp.lastprivate.done:
+// CHECK7-NEXT: br label [[OMP_PRECOND_END]]
+// CHECK7: omp.precond.end:
+// CHECK7-NEXT: ret void
+//
+//
+// CHECK7-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l44
+// CHECK7-SAME: (i64 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] {
+// CHECK7-NEXT: entry:
+// CHECK7-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8
+// CHECK7-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 8
+// CHECK7-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8
+// CHECK7-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK7-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK7-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK7-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8
+// CHECK7-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 8
+// CHECK7-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32*
+// CHECK7-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 8
+// CHECK7-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK7-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0)
+// CHECK7-NEXT: br label [[DOTEXECUTE:%.*]]
+// CHECK7: .execute:
+// CHECK7-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]])
+// CHECK7-NEXT: [[TMP2:%.*]] = load i32, i32* [[CONV]], align 8
+// CHECK7-NEXT: [[CONV1:%.*]] = bitcast i64* [[N_CASTED]] to i32*
+// CHECK7-NEXT: store i32 [[TMP2]], i32* [[CONV1]], align 4
+// CHECK7-NEXT: [[TMP3:%.*]] = load i64, i64* [[N_CASTED]], align 8
+// CHECK7-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK7-NEXT: call void @__omp_outlined__2(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i64 [[TMP3]], [1000 x i16]* [[TMP0]]) #[[ATTR2]]
+// CHECK7-NEXT: br label [[DOTOMP_DEINIT:%.*]]
+// CHECK7: .omp.deinit:
+// CHECK7-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0)
+// CHECK7-NEXT: br label [[DOTEXIT:%.*]]
+// CHECK7: .exit:
+// CHECK7-NEXT: ret void
+//
+//
+// CHECK7-LABEL: define {{[^@]+}}@__omp_outlined__2
+// CHECK7-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] {
+// CHECK7-NEXT: entry:
+// CHECK7-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK7-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK7-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8
+// CHECK7-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 8
+// CHECK7-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK7-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK7-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK7-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
+// CHECK7-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK7-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK7-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK7-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK7-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK7-NEXT: [[I3:%.*]] = alloca i32, align 4
+// CHECK7-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8
+// CHECK7-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x i8*], align 8
+// CHECK7-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK7-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
+// CHECK7-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8
+// CHECK7-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 8
+// CHECK7-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32*
+// CHECK7-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 8
+// CHECK7-NEXT: [[TMP1:%.*]] = load i32, i32* [[CONV]], align 8
+// CHECK7-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK7-NEXT:
[[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK7-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 +// CHECK7-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK7-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK7-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK7-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK7-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK7-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] +// CHECK7-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK7: omp.precond.then: +// CHECK7-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK7-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK7-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK7-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK7-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK7-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK7-NEXT: [[TMP5:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK7-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4 +// CHECK7-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP6]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) +// CHECK7-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK7-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK7-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP7]], [[TMP8]] +// CHECK7-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK7: cond.true: +// CHECK7-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK7-NEXT: br label [[COND_END:%.*]] +// CHECK7: cond.false: +// CHECK7-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK7-NEXT: br label [[COND_END]] +// CHECK7: cond.end: +// CHECK7-NEXT: [[COND:%.*]] = phi i32 [ [[TMP9]], [[COND_TRUE]] ], [ [[TMP10]], [[COND_FALSE]] ] +// CHECK7-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK7-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK7-NEXT: store i32 [[TMP11]], i32* [[DOTOMP_IV]], align 4 +// CHECK7-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK7: omp.inner.for.cond: +// CHECK7-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK7-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK7-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP13]], 1 +// CHECK7-NEXT: [[CMP5:%.*]] = icmp slt i32 [[TMP12]], [[ADD]] +// CHECK7-NEXT: br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK7: omp.inner.for.body: +// CHECK7-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK7-NEXT: [[TMP15:%.*]] = zext i32 [[TMP14]] to i64 +// CHECK7-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK7-NEXT: [[TMP17:%.*]] = zext i32 [[TMP16]] to i64 +// CHECK7-NEXT: [[TMP18:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK7-NEXT: [[CONV6:%.*]] = bitcast i64* [[N_CASTED]] to i32* +// CHECK7-NEXT: store i32 [[TMP18]], i32* [[CONV6]], align 4 +// CHECK7-NEXT: [[TMP19:%.*]] = load i64, i64* [[N_CASTED]], align 8 +// CHECK7-NEXT: [[TMP20:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0 +// CHECK7-NEXT: [[TMP21:%.*]] = inttoptr i64 [[TMP15]] to i8* +// CHECK7-NEXT: store i8* 
[[TMP21]], i8** [[TMP20]], align 8 +// CHECK7-NEXT: [[TMP22:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1 +// CHECK7-NEXT: [[TMP23:%.*]] = inttoptr i64 [[TMP17]] to i8* +// CHECK7-NEXT: store i8* [[TMP23]], i8** [[TMP22]], align 8 +// CHECK7-NEXT: [[TMP24:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 2 +// CHECK7-NEXT: [[TMP25:%.*]] = inttoptr i64 [[TMP19]] to i8* +// CHECK7-NEXT: store i8* [[TMP25]], i8** [[TMP24]], align 8 +// CHECK7-NEXT: [[TMP26:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 3 +// CHECK7-NEXT: [[TMP27:%.*]] = bitcast [1000 x i16]* [[TMP0]] to i8* +// CHECK7-NEXT: store i8* [[TMP27]], i8** [[TMP26]], align 8 +// CHECK7-NEXT: [[TMP28:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK7-NEXT: [[TMP29:%.*]] = load i32, i32* [[TMP28]], align 4 +// CHECK7-NEXT: [[TMP30:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK7-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP29]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i64, i64, i64, [1000 x i16]*)* @__omp_outlined__3 to i8*), i8* null, i8** [[TMP30]], i64 4) +// CHECK7-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK7: omp.inner.for.inc: +// CHECK7-NEXT: [[TMP31:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK7-NEXT: [[TMP32:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK7-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP31]], [[TMP32]] +// CHECK7-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_IV]], align 4 +// CHECK7-NEXT: [[TMP33:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK7-NEXT: [[TMP34:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK7-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP33]], [[TMP34]] +// CHECK7-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK7-NEXT: [[TMP35:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK7-NEXT: [[TMP36:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK7-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP35]], [[TMP36]] +// CHECK7-NEXT: store i32 [[ADD9]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK7-NEXT: [[TMP37:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK7-NEXT: [[TMP38:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK7-NEXT: [[CMP10:%.*]] = icmp sgt i32 [[TMP37]], [[TMP38]] +// CHECK7-NEXT: br i1 [[CMP10]], label [[COND_TRUE11:%.*]], label [[COND_FALSE12:%.*]] +// CHECK7: cond.true11: +// CHECK7-NEXT: [[TMP39:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK7-NEXT: br label [[COND_END13:%.*]] +// CHECK7: cond.false12: +// CHECK7-NEXT: [[TMP40:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK7-NEXT: br label [[COND_END13]] +// CHECK7: cond.end13: +// CHECK7-NEXT: [[COND14:%.*]] = phi i32 [ [[TMP39]], [[COND_TRUE11]] ], [ [[TMP40]], [[COND_FALSE12]] ] +// CHECK7-NEXT: store i32 [[COND14]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK7-NEXT: [[TMP41:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK7-NEXT: store i32 [[TMP41]], i32* [[DOTOMP_IV]], align 4 +// CHECK7-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP13:![0-9]+]] +// CHECK7: omp.inner.for.end: +// CHECK7-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK7: omp.loop.exit: +// CHECK7-NEXT: [[TMP42:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK7-NEXT: [[TMP43:%.*]] = load i32, i32* [[TMP42]], align 4 +// CHECK7-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP43]]) +// 
CHECK7-NEXT: [[TMP44:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK7-NEXT: [[TMP45:%.*]] = icmp ne i32 [[TMP44]], 0 +// CHECK7-NEXT: br i1 [[TMP45]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] +// CHECK7: .omp.final.then: +// CHECK7-NEXT: [[TMP46:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK7-NEXT: [[SUB15:%.*]] = sub nsw i32 [[TMP46]], 0 +// CHECK7-NEXT: [[DIV16:%.*]] = sdiv i32 [[SUB15]], 1 +// CHECK7-NEXT: [[MUL:%.*]] = mul nsw i32 [[DIV16]], 1 +// CHECK7-NEXT: [[ADD17:%.*]] = add nsw i32 0, [[MUL]] +// CHECK7-NEXT: store i32 [[ADD17]], i32* [[I3]], align 4 +// CHECK7-NEXT: br label [[DOTOMP_FINAL_DONE]] +// CHECK7: .omp.final.done: +// CHECK7-NEXT: br label [[OMP_PRECOND_END]] +// CHECK7: omp.precond.end: +// CHECK7-NEXT: ret void +// +// +// CHECK7-LABEL: define {{[^@]+}}@__omp_outlined__3 +// CHECK7-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], i64 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { +// CHECK7-NEXT: entry: +// CHECK7-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK7-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK7-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8 +// CHECK7-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8 +// CHECK7-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 +// CHECK7-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 8 +// CHECK7-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: [[I5:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK7-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +// CHECK7-NEXT: store i64 [[DOTPREVIOUS_LB_]], i64* [[DOTPREVIOUS_LB__ADDR]], align 8 +// CHECK7-NEXT: store i64 [[DOTPREVIOUS_UB_]], i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK7-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 +// CHECK7-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 8 +// CHECK7-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* +// CHECK7-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 8 +// CHECK7-NEXT: [[TMP1:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK7-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK7-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK7-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 +// CHECK7-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK7-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK7-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK7-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK7-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK7-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] +// CHECK7-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK7: omp.precond.then: +// CHECK7-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 +// CHECK7-NEXT: 
[[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK7-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_UB]], align 4 +// CHECK7-NEXT: [[TMP5:%.*]] = load i64, i64* [[DOTPREVIOUS_LB__ADDR]], align 8 +// CHECK7-NEXT: [[CONV3:%.*]] = trunc i64 [[TMP5]] to i32 +// CHECK7-NEXT: [[TMP6:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK7-NEXT: [[CONV4:%.*]] = trunc i64 [[TMP6]] to i32 +// CHECK7-NEXT: store i32 [[CONV3]], i32* [[DOTOMP_LB]], align 4 +// CHECK7-NEXT: store i32 [[CONV4]], i32* [[DOTOMP_UB]], align 4 +// CHECK7-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK7-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK7-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK7-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4 +// CHECK7-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP8]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) +// CHECK7-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK7-NEXT: store i32 [[TMP9]], i32* [[DOTOMP_IV]], align 4 +// CHECK7-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK7: omp.inner.for.cond: +// CHECK7-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK7-NEXT: [[CONV6:%.*]] = sext i32 [[TMP10]] to i64 +// CHECK7-NEXT: [[TMP11:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK7-NEXT: [[CMP7:%.*]] = icmp ule i64 [[CONV6]], [[TMP11]] +// CHECK7-NEXT: br i1 [[CMP7]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK7: omp.inner.for.body: +// CHECK7-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK7-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP12]], 1 +// CHECK7-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK7-NEXT: store i32 [[ADD]], i32* [[I5]], align 4 +// CHECK7-NEXT: [[TMP13:%.*]] = load i32, i32* [[I5]], align 4 +// CHECK7-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP13]] to i64 +// CHECK7-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1000 x i16], [1000 x i16]* [[TMP0]], i64 0, i64 [[IDXPROM]] +// CHECK7-NEXT: [[TMP14:%.*]] = load i16, i16* [[ARRAYIDX]], align 2 +// CHECK7-NEXT: [[CONV8:%.*]] = sext i16 [[TMP14]] to i32 +// CHECK7-NEXT: [[ADD9:%.*]] = add nsw i32 [[CONV8]], 1 +// CHECK7-NEXT: [[CONV10:%.*]] = trunc i32 [[ADD9]] to i16 +// CHECK7-NEXT: store i16 [[CONV10]], i16* [[ARRAYIDX]], align 2 +// CHECK7-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK7: omp.body.continue: +// CHECK7-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK7: omp.inner.for.inc: +// CHECK7-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK7-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK7-NEXT: [[ADD11:%.*]] = add nsw i32 [[TMP15]], [[TMP16]] +// CHECK7-NEXT: store i32 [[ADD11]], i32* [[DOTOMP_IV]], align 4 +// CHECK7-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP14:![0-9]+]] +// CHECK7: omp.inner.for.end: +// CHECK7-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK7: omp.loop.exit: +// CHECK7-NEXT: [[TMP17:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK7-NEXT: [[TMP18:%.*]] = load i32, i32* [[TMP17]], align 4 +// CHECK7-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP18]]) +// CHECK7-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK7-NEXT: [[TMP20:%.*]] = icmp ne i32 [[TMP19]], 0 +// CHECK7-NEXT: br i1 [[TMP20]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] +// 
CHECK7: .omp.final.then: +// CHECK7-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK7-NEXT: [[SUB12:%.*]] = sub nsw i32 [[TMP21]], 0 +// CHECK7-NEXT: [[DIV13:%.*]] = sdiv i32 [[SUB12]], 1 +// CHECK7-NEXT: [[MUL14:%.*]] = mul nsw i32 [[DIV13]], 1 +// CHECK7-NEXT: [[ADD15:%.*]] = add nsw i32 0, [[MUL14]] +// CHECK7-NEXT: store i32 [[ADD15]], i32* [[I5]], align 4 +// CHECK7-NEXT: br label [[DOTOMP_FINAL_DONE]] +// CHECK7: .omp.final.done: +// CHECK7-NEXT: br label [[OMP_PRECOND_END]] +// CHECK7: omp.precond.end: +// CHECK7-NEXT: ret void +// +// +// CHECK7-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l49 +// CHECK7-SAME: ([10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { +// CHECK7-NEXT: entry: +// CHECK7-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 8 +// CHECK7-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK7-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 8 +// CHECK7-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 8 +// CHECK7-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK7-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) +// CHECK7-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK7: .execute: +// CHECK7-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) +// CHECK7-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK7-NEXT: call void @__omp_outlined__4(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x i32]* [[TMP0]]) #[[ATTR2]] +// CHECK7-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK7: .omp.deinit: +// CHECK7-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) +// CHECK7-NEXT: br label [[DOTEXIT:%.*]] +// CHECK7: .exit: +// CHECK7-NEXT: ret void +// +// +// CHECK7-LABEL: define {{[^@]+}}@__omp_outlined__4 +// CHECK7-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { +// CHECK7-NEXT: entry: +// CHECK7-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK7-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK7-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 8 +// CHECK7-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [3 x i8*], align 8 +// CHECK7-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK7-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +// CHECK7-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 8 +// CHECK7-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 8 +// CHECK7-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK7-NEXT: store i32 9, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK7-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK7-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK7-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// 
CHECK7-NEXT: [[TMP1:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK7-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4 +// CHECK7-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) +// CHECK7-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK7-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP3]], 9 +// CHECK7-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK7: cond.true: +// CHECK7-NEXT: br label [[COND_END:%.*]] +// CHECK7: cond.false: +// CHECK7-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK7-NEXT: br label [[COND_END]] +// CHECK7: cond.end: +// CHECK7-NEXT: [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP4]], [[COND_FALSE]] ] +// CHECK7-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK7-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK7-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 +// CHECK7-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK7: omp.inner.for.cond: +// CHECK7-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK7-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP6]], 10 +// CHECK7-NEXT: br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK7: omp.inner.for.body: +// CHECK7-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK7-NEXT: [[TMP8:%.*]] = zext i32 [[TMP7]] to i64 +// CHECK7-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK7-NEXT: [[TMP10:%.*]] = zext i32 [[TMP9]] to i64 +// CHECK7-NEXT: [[TMP11:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0 +// CHECK7-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP8]] to i8* +// CHECK7-NEXT: store i8* [[TMP12]], i8** [[TMP11]], align 8 +// CHECK7-NEXT: [[TMP13:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1 +// CHECK7-NEXT: [[TMP14:%.*]] = inttoptr i64 [[TMP10]] to i8* +// CHECK7-NEXT: store i8* [[TMP14]], i8** [[TMP13]], align 8 +// CHECK7-NEXT: [[TMP15:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 2 +// CHECK7-NEXT: [[TMP16:%.*]] = bitcast [10 x i32]* [[TMP0]] to i8* +// CHECK7-NEXT: store i8* [[TMP16]], i8** [[TMP15]], align 8 +// CHECK7-NEXT: [[TMP17:%.*]] = bitcast [3 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK7-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i64, i64, [10 x i32]*)* @__omp_outlined__5 to i8*), i8* null, i8** [[TMP17]], i64 3) +// CHECK7-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK7: omp.inner.for.inc: +// CHECK7-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK7-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK7-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP18]], [[TMP19]] +// CHECK7-NEXT: store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4 +// CHECK7-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK7-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK7-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP20]], [[TMP21]] +// CHECK7-NEXT: store i32 [[ADD2]], i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK7-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK7-NEXT: [[TMP23:%.*]] = load i32, i32* 
[[DOTOMP_STRIDE]], align 4 +// CHECK7-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP22]], [[TMP23]] +// CHECK7-NEXT: store i32 [[ADD3]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK7-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK7-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP24]], 9 +// CHECK7-NEXT: br i1 [[CMP4]], label [[COND_TRUE5:%.*]], label [[COND_FALSE6:%.*]] +// CHECK7: cond.true5: +// CHECK7-NEXT: br label [[COND_END7:%.*]] +// CHECK7: cond.false6: +// CHECK7-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK7-NEXT: br label [[COND_END7]] +// CHECK7: cond.end7: +// CHECK7-NEXT: [[COND8:%.*]] = phi i32 [ 9, [[COND_TRUE5]] ], [ [[TMP25]], [[COND_FALSE6]] ] +// CHECK7-NEXT: store i32 [[COND8]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK7-NEXT: [[TMP26:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK7-NEXT: store i32 [[TMP26]], i32* [[DOTOMP_IV]], align 4 +// CHECK7-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP15:![0-9]+]] +// CHECK7: omp.inner.for.end: +// CHECK7-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK7: omp.loop.exit: +// CHECK7-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]]) +// CHECK7-NEXT: [[TMP27:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK7-NEXT: [[TMP28:%.*]] = icmp ne i32 [[TMP27]], 0 +// CHECK7-NEXT: br i1 [[TMP28]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] +// CHECK7: .omp.final.then: +// CHECK7-NEXT: store i32 10, i32* [[I]], align 4 +// CHECK7-NEXT: br label [[DOTOMP_FINAL_DONE]] +// CHECK7: .omp.final.done: +// CHECK7-NEXT: ret void +// +// +// CHECK7-LABEL: define {{[^@]+}}@__omp_outlined__5 +// CHECK7-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { +// CHECK7-NEXT: entry: +// CHECK7-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK7-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK7-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8 +// CHECK7-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8 +// CHECK7-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 8 +// CHECK7-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK7-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +// CHECK7-NEXT: store i64 [[DOTPREVIOUS_LB_]], i64* [[DOTPREVIOUS_LB__ADDR]], align 8 +// CHECK7-NEXT: store i64 [[DOTPREVIOUS_UB_]], i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK7-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 8 +// CHECK7-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 8 +// CHECK7-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 +// CHECK7-NEXT: store i32 9, i32* [[DOTOMP_UB]], align 4 +// CHECK7-NEXT: [[TMP1:%.*]] = load i64, i64* [[DOTPREVIOUS_LB__ADDR]], align 8 +// CHECK7-NEXT: [[CONV:%.*]] = trunc i64 [[TMP1]] to i32 +// CHECK7-NEXT: [[TMP2:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK7-NEXT: [[CONV1:%.*]] = trunc i64 [[TMP2]] to i32 +// 
CHECK7-NEXT: store i32 [[CONV]], i32* [[DOTOMP_LB]], align 4 +// CHECK7-NEXT: store i32 [[CONV1]], i32* [[DOTOMP_UB]], align 4 +// CHECK7-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK7-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK7-NEXT: [[TMP3:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK7-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4 +// CHECK7-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP4]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) +// CHECK7-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK7-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 +// CHECK7-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK7: omp.inner.for.cond: +// CHECK7-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK7-NEXT: [[CONV2:%.*]] = sext i32 [[TMP6]] to i64 +// CHECK7-NEXT: [[TMP7:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK7-NEXT: [[CMP:%.*]] = icmp ule i64 [[CONV2]], [[TMP7]] +// CHECK7-NEXT: br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK7: omp.inner.for.body: +// CHECK7-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK7-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP8]], 1 +// CHECK7-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK7-NEXT: store i32 [[ADD]], i32* [[I]], align 4 +// CHECK7-NEXT: [[TMP9:%.*]] = load i32, i32* [[I]], align 4 +// CHECK7-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP9]] to i64 +// CHECK7-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP0]], i64 0, i64 [[IDXPROM]] +// CHECK7-NEXT: [[TMP10:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +// CHECK7-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP10]], 1 +// CHECK7-NEXT: store i32 [[ADD3]], i32* [[ARRAYIDX]], align 4 +// CHECK7-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK7: omp.body.continue: +// CHECK7-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK7: omp.inner.for.inc: +// CHECK7-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK7-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK7-NEXT: [[ADD4:%.*]] = add nsw i32 [[TMP11]], [[TMP12]] +// CHECK7-NEXT: store i32 [[ADD4]], i32* [[DOTOMP_IV]], align 4 +// CHECK7-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP16:![0-9]+]] +// CHECK7: omp.inner.for.end: +// CHECK7-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK7: omp.loop.exit: +// CHECK7-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP4]]) +// CHECK7-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK7-NEXT: [[TMP14:%.*]] = icmp ne i32 [[TMP13]], 0 +// CHECK7-NEXT: br i1 [[TMP14]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] +// CHECK7: .omp.final.then: +// CHECK7-NEXT: store i32 10, i32* [[I]], align 4 +// CHECK7-NEXT: br label [[DOTOMP_FINAL_DONE]] +// CHECK7: .omp.final.done: +// CHECK7-NEXT: ret void +// +// +// CHECK7-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l54 +// CHECK7-SAME: ([10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i64 [[F:%.*]]) #[[ATTR0]] { +// CHECK7-NEXT: entry: +// CHECK7-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 8 +// CHECK7-NEXT: [[F_ADDR:%.*]] = alloca i64, align 8 +// CHECK7-NEXT: [[F_CASTED:%.*]] = alloca i64, align 8 +// CHECK7-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: 
[[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK7-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 8 +// CHECK7-NEXT: store i64 [[F]], i64* [[F_ADDR]], align 8 +// CHECK7-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 8 +// CHECK7-NEXT: [[CONV:%.*]] = bitcast i64* [[F_ADDR]] to i32* +// CHECK7-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK7-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) +// CHECK7-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK7: .execute: +// CHECK7-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) +// CHECK7-NEXT: [[TMP2:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK7-NEXT: [[CONV1:%.*]] = bitcast i64* [[F_CASTED]] to i32* +// CHECK7-NEXT: store i32 [[TMP2]], i32* [[CONV1]], align 4 +// CHECK7-NEXT: [[TMP3:%.*]] = load i64, i64* [[F_CASTED]], align 8 +// CHECK7-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK7-NEXT: call void @__omp_outlined__6(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x [10 x i32]]* [[TMP0]], i64 [[TMP3]]) #[[ATTR2]] +// CHECK7-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK7: .omp.deinit: +// CHECK7-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) +// CHECK7-NEXT: br label [[DOTEXIT:%.*]] +// CHECK7: .exit: +// CHECK7-NEXT: ret void +// +// +// CHECK7-LABEL: define {{[^@]+}}@__omp_outlined__6 +// CHECK7-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i64 [[F:%.*]]) #[[ATTR0]] { +// CHECK7-NEXT: entry: +// CHECK7-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK7-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK7-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 8 +// CHECK7-NEXT: [[F_ADDR:%.*]] = alloca i64, align 8 +// CHECK7-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: [[_TMP1:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: [[K:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: [[J:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: [[F_CASTED:%.*]] = alloca i64, align 8 +// CHECK7-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x i8*], align 8 +// CHECK7-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK7-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +// CHECK7-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 8 +// CHECK7-NEXT: store i64 [[F]], i64* [[F_ADDR]], align 8 +// CHECK7-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 8 +// CHECK7-NEXT: [[CONV:%.*]] = bitcast i64* [[F_ADDR]] to i32* +// CHECK7-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK7-NEXT: store i32 99, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK7-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK7-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK7-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK7-NEXT: [[TMP1:%.*]] = load i32*, i32** 
[[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK7-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4 +// CHECK7-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) +// CHECK7-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK7-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP3]], 99 +// CHECK7-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK7: cond.true: +// CHECK7-NEXT: br label [[COND_END:%.*]] +// CHECK7: cond.false: +// CHECK7-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK7-NEXT: br label [[COND_END]] +// CHECK7: cond.end: +// CHECK7-NEXT: [[COND:%.*]] = phi i32 [ 99, [[COND_TRUE]] ], [ [[TMP4]], [[COND_FALSE]] ] +// CHECK7-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK7-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK7-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 +// CHECK7-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK7: omp.inner.for.cond: +// CHECK7-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK7-NEXT: [[CMP2:%.*]] = icmp slt i32 [[TMP6]], 100 +// CHECK7-NEXT: br i1 [[CMP2]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK7: omp.inner.for.body: +// CHECK7-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK7-NEXT: [[TMP8:%.*]] = zext i32 [[TMP7]] to i64 +// CHECK7-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK7-NEXT: [[TMP10:%.*]] = zext i32 [[TMP9]] to i64 +// CHECK7-NEXT: [[TMP11:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK7-NEXT: [[CONV3:%.*]] = bitcast i64* [[F_CASTED]] to i32* +// CHECK7-NEXT: store i32 [[TMP11]], i32* [[CONV3]], align 4 +// CHECK7-NEXT: [[TMP12:%.*]] = load i64, i64* [[F_CASTED]], align 8 +// CHECK7-NEXT: [[TMP13:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0 +// CHECK7-NEXT: [[TMP14:%.*]] = inttoptr i64 [[TMP8]] to i8* +// CHECK7-NEXT: store i8* [[TMP14]], i8** [[TMP13]], align 8 +// CHECK7-NEXT: [[TMP15:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1 +// CHECK7-NEXT: [[TMP16:%.*]] = inttoptr i64 [[TMP10]] to i8* +// CHECK7-NEXT: store i8* [[TMP16]], i8** [[TMP15]], align 8 +// CHECK7-NEXT: [[TMP17:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 2 +// CHECK7-NEXT: [[TMP18:%.*]] = bitcast [10 x [10 x i32]]* [[TMP0]] to i8* +// CHECK7-NEXT: store i8* [[TMP18]], i8** [[TMP17]], align 8 +// CHECK7-NEXT: [[TMP19:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 3 +// CHECK7-NEXT: [[TMP20:%.*]] = inttoptr i64 [[TMP12]] to i8* +// CHECK7-NEXT: store i8* [[TMP20]], i8** [[TMP19]], align 8 +// CHECK7-NEXT: [[TMP21:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK7-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i64, i64, [10 x [10 x i32]]*, i64)* @__omp_outlined__7 to i8*), i8* null, i8** [[TMP21]], i64 4) +// CHECK7-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK7: omp.inner.for.inc: +// CHECK7-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK7-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK7-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP22]], [[TMP23]] +// 
CHECK7-NEXT: store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4 +// CHECK7-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK7-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK7-NEXT: [[ADD4:%.*]] = add nsw i32 [[TMP24]], [[TMP25]] +// CHECK7-NEXT: store i32 [[ADD4]], i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK7-NEXT: [[TMP26:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK7-NEXT: [[TMP27:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK7-NEXT: [[ADD5:%.*]] = add nsw i32 [[TMP26]], [[TMP27]] +// CHECK7-NEXT: store i32 [[ADD5]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK7-NEXT: [[TMP28:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK7-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[TMP28]], 99 +// CHECK7-NEXT: br i1 [[CMP6]], label [[COND_TRUE7:%.*]], label [[COND_FALSE8:%.*]] +// CHECK7: cond.true7: +// CHECK7-NEXT: br label [[COND_END9:%.*]] +// CHECK7: cond.false8: +// CHECK7-NEXT: [[TMP29:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK7-NEXT: br label [[COND_END9]] +// CHECK7: cond.end9: +// CHECK7-NEXT: [[COND10:%.*]] = phi i32 [ 99, [[COND_TRUE7]] ], [ [[TMP29]], [[COND_FALSE8]] ] +// CHECK7-NEXT: store i32 [[COND10]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK7-NEXT: [[TMP30:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK7-NEXT: store i32 [[TMP30]], i32* [[DOTOMP_IV]], align 4 +// CHECK7-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP17:![0-9]+]] +// CHECK7: omp.inner.for.end: +// CHECK7-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK7: omp.loop.exit: +// CHECK7-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]]) +// CHECK7-NEXT: [[TMP31:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK7-NEXT: [[TMP32:%.*]] = icmp ne i32 [[TMP31]], 0 +// CHECK7-NEXT: br i1 [[TMP32]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] +// CHECK7: .omp.final.then: +// CHECK7-NEXT: store i32 10, i32* [[I]], align 4 +// CHECK7-NEXT: store i32 10, i32* [[J]], align 4 +// CHECK7-NEXT: br label [[DOTOMP_FINAL_DONE]] +// CHECK7: .omp.final.done: +// CHECK7-NEXT: ret void +// +// +// CHECK7-LABEL: define {{[^@]+}}@__omp_outlined__7 +// CHECK7-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i64 [[F:%.*]]) #[[ATTR0]] { +// CHECK7-NEXT: entry: +// CHECK7-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK7-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK7-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8 +// CHECK7-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8 +// CHECK7-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 8 +// CHECK7-NEXT: [[F_ADDR:%.*]] = alloca i64, align 8 +// CHECK7-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: [[_TMP1:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: [[K:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: [[J:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK7-NEXT: store i32* [[DOTBOUND_TID_]], i32** 
[[DOTBOUND_TID__ADDR]], align 8 +// CHECK7-NEXT: store i64 [[DOTPREVIOUS_LB_]], i64* [[DOTPREVIOUS_LB__ADDR]], align 8 +// CHECK7-NEXT: store i64 [[DOTPREVIOUS_UB_]], i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK7-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 8 +// CHECK7-NEXT: store i64 [[F]], i64* [[F_ADDR]], align 8 +// CHECK7-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 8 +// CHECK7-NEXT: [[CONV:%.*]] = bitcast i64* [[F_ADDR]] to i32* +// CHECK7-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 +// CHECK7-NEXT: store i32 99, i32* [[DOTOMP_UB]], align 4 +// CHECK7-NEXT: [[TMP1:%.*]] = load i64, i64* [[DOTPREVIOUS_LB__ADDR]], align 8 +// CHECK7-NEXT: [[CONV2:%.*]] = trunc i64 [[TMP1]] to i32 +// CHECK7-NEXT: [[TMP2:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK7-NEXT: [[CONV3:%.*]] = trunc i64 [[TMP2]] to i32 +// CHECK7-NEXT: store i32 [[CONV2]], i32* [[DOTOMP_LB]], align 4 +// CHECK7-NEXT: store i32 [[CONV3]], i32* [[DOTOMP_UB]], align 4 +// CHECK7-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK7-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK7-NEXT: [[TMP3:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK7-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4 +// CHECK7-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP4]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) +// CHECK7-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK7-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 +// CHECK7-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK7: omp.inner.for.cond: +// CHECK7-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK7-NEXT: [[CONV4:%.*]] = sext i32 [[TMP6]] to i64 +// CHECK7-NEXT: [[TMP7:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK7-NEXT: [[CMP:%.*]] = icmp ule i64 [[CONV4]], [[TMP7]] +// CHECK7-NEXT: br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK7: omp.inner.for.body: +// CHECK7-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK7-NEXT: [[DIV:%.*]] = sdiv i32 [[TMP8]], 10 +// CHECK7-NEXT: [[MUL:%.*]] = mul nsw i32 [[DIV]], 1 +// CHECK7-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK7-NEXT: store i32 [[ADD]], i32* [[I]], align 4 +// CHECK7-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK7-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK7-NEXT: [[DIV5:%.*]] = sdiv i32 [[TMP10]], 10 +// CHECK7-NEXT: [[MUL6:%.*]] = mul nsw i32 [[DIV5]], 10 +// CHECK7-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP9]], [[MUL6]] +// CHECK7-NEXT: [[MUL7:%.*]] = mul nsw i32 [[SUB]], 1 +// CHECK7-NEXT: [[ADD8:%.*]] = add nsw i32 0, [[MUL7]] +// CHECK7-NEXT: store i32 [[ADD8]], i32* [[J]], align 4 +// CHECK7-NEXT: store i32 10, i32* [[K]], align 4 +// CHECK7-NEXT: [[TMP11:%.*]] = load i32, i32* [[I]], align 4 +// CHECK7-NEXT: [[TMP12:%.*]] = load i32, i32* [[J]], align 4 +// CHECK7-NEXT: [[TMP13:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK7-NEXT: [[MUL9:%.*]] = mul nsw i32 [[TMP12]], [[TMP13]] +// CHECK7-NEXT: [[ADD10:%.*]] = add nsw i32 [[TMP11]], [[MUL9]] +// CHECK7-NEXT: [[TMP14:%.*]] = load i32, i32* [[K]], align 4 +// CHECK7-NEXT: [[ADD11:%.*]] = add nsw i32 [[ADD10]], [[TMP14]] +// CHECK7-NEXT: [[TMP15:%.*]] = load i32, i32* [[I]], align 4 +// CHECK7-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP15]] to i64 +// 
CHECK7-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* [[TMP0]], i64 0, i64 [[IDXPROM]] +// CHECK7-NEXT: [[TMP16:%.*]] = load i32, i32* [[J]], align 4 +// CHECK7-NEXT: [[IDXPROM12:%.*]] = sext i32 [[TMP16]] to i64 +// CHECK7-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[ARRAYIDX]], i64 0, i64 [[IDXPROM12]] +// CHECK7-NEXT: store i32 [[ADD11]], i32* [[ARRAYIDX13]], align 4 +// CHECK7-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK7: omp.body.continue: +// CHECK7-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK7: omp.inner.for.inc: +// CHECK7-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK7-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK7-NEXT: [[ADD14:%.*]] = add nsw i32 [[TMP17]], [[TMP18]] +// CHECK7-NEXT: store i32 [[ADD14]], i32* [[DOTOMP_IV]], align 4 +// CHECK7-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP18:![0-9]+]] +// CHECK7: omp.inner.for.end: +// CHECK7-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK7: omp.loop.exit: +// CHECK7-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP4]]) +// CHECK7-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK7-NEXT: [[TMP20:%.*]] = icmp ne i32 [[TMP19]], 0 +// CHECK7-NEXT: br i1 [[TMP20]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] +// CHECK7: .omp.final.then: +// CHECK7-NEXT: store i32 10, i32* [[I]], align 4 +// CHECK7-NEXT: store i32 10, i32* [[J]], align 4 +// CHECK7-NEXT: br label [[DOTOMP_FINAL_DONE]] +// CHECK7: .omp.final.done: +// CHECK7-NEXT: ret void +// +// +// CHECK8-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l38 +// CHECK8-SAME: (i64 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i64 [[L:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK8-NEXT: entry: +// CHECK8-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 +// CHECK8-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 8 +// CHECK8-NEXT: [[L_ADDR:%.*]] = alloca i64, align 8 +// CHECK8-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8 +// CHECK8-NEXT: [[L_CASTED:%.*]] = alloca i64, align 8 +// CHECK8-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK8-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 +// CHECK8-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 8 +// CHECK8-NEXT: store i64 [[L]], i64* [[L_ADDR]], align 8 +// CHECK8-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* +// CHECK8-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 8 +// CHECK8-NEXT: [[CONV1:%.*]] = bitcast i64* [[L_ADDR]] to i32* +// CHECK8-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK8-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) +// CHECK8-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK8: .execute: +// CHECK8-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3:[0-9]+]]) +// CHECK8-NEXT: [[TMP2:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK8-NEXT: [[CONV2:%.*]] = bitcast i64* [[N_CASTED]] to i32* +// CHECK8-NEXT: store i32 [[TMP2]], i32* [[CONV2]], align 4 +// CHECK8-NEXT: [[TMP3:%.*]] = load i64, i64* [[N_CASTED]], align 8 +// CHECK8-NEXT: [[TMP4:%.*]] = load i32, i32* [[CONV1]], align 8 +// CHECK8-NEXT: [[CONV3:%.*]] = bitcast i64* [[L_CASTED]] to i32* +// CHECK8-NEXT: store i32 [[TMP4]], 
i32* [[CONV3]], align 4 +// CHECK8-NEXT: [[TMP5:%.*]] = load i64, i64* [[L_CASTED]], align 8 +// CHECK8-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK8-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i64 [[TMP3]], [1000 x i32]* [[TMP0]], i64 [[TMP5]]) #[[ATTR2:[0-9]+]] +// CHECK8-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK8: .omp.deinit: +// CHECK8-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) +// CHECK8-NEXT: br label [[DOTEXIT:%.*]] +// CHECK8: .exit: +// CHECK8-NEXT: ret void +// +// +// CHECK8-LABEL: define {{[^@]+}}@__omp_outlined__ +// CHECK8-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i64 [[L:%.*]]) #[[ATTR0]] { +// CHECK8-NEXT: entry: +// CHECK8-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK8-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK8-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 +// CHECK8-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 8 +// CHECK8-NEXT: [[L_ADDR:%.*]] = alloca i64, align 8 +// CHECK8-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: [[DOTCAPTURE_EXPR_3:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: [[I5:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8 +// CHECK8-NEXT: [[L_CASTED:%.*]] = alloca i64, align 8 +// CHECK8-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [5 x i8*], align 8 +// CHECK8-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK8-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +// CHECK8-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 +// CHECK8-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 8 +// CHECK8-NEXT: store i64 [[L]], i64* [[L_ADDR]], align 8 +// CHECK8-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* +// CHECK8-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 8 +// CHECK8-NEXT: [[CONV1:%.*]] = bitcast i64* [[L_ADDR]] to i32* +// CHECK8-NEXT: [[TMP1:%.*]] = call i8* @__kmpc_data_sharing_push_stack(i64 4, i16 1) +// CHECK8-NEXT: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to %struct._globalized_locals_ty* +// CHECK8-NEXT: [[L2:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP2]], i32 0, i32 0 +// CHECK8-NEXT: [[TMP3:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK8-NEXT: store i32 [[TMP3]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK8-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK8-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP4]], 0 +// CHECK8-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK8-NEXT: [[SUB4:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK8-NEXT: store i32 [[SUB4]], i32* [[DOTCAPTURE_EXPR_3]], align 4 +// CHECK8-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK8-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK8-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP5]] +// CHECK8-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK8: 
omp.precond.then: +// CHECK8-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK8-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4 +// CHECK8-NEXT: store i32 [[TMP6]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK8-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK8-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK8-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK8-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4 +// CHECK8-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1:[0-9]+]], i32 [[TMP8]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 128) +// CHECK8-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK8-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4 +// CHECK8-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[TMP9]], [[TMP10]] +// CHECK8-NEXT: br i1 [[CMP6]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK8: cond.true: +// CHECK8-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4 +// CHECK8-NEXT: br label [[COND_END:%.*]] +// CHECK8: cond.false: +// CHECK8-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK8-NEXT: br label [[COND_END]] +// CHECK8: cond.end: +// CHECK8-NEXT: [[COND:%.*]] = phi i32 [ [[TMP11]], [[COND_TRUE]] ], [ [[TMP12]], [[COND_FALSE]] ] +// CHECK8-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK8-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK8-NEXT: store i32 [[TMP13]], i32* [[DOTOMP_IV]], align 4 +// CHECK8-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK8: omp.inner.for.cond: +// CHECK8-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK8-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4 +// CHECK8-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP15]], 1 +// CHECK8-NEXT: [[CMP7:%.*]] = icmp slt i32 [[TMP14]], [[ADD]] +// CHECK8-NEXT: br i1 [[CMP7]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK8: omp.inner.for.body: +// CHECK8-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK8-NEXT: [[TMP17:%.*]] = zext i32 [[TMP16]] to i64 +// CHECK8-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK8-NEXT: [[TMP19:%.*]] = zext i32 [[TMP18]] to i64 +// CHECK8-NEXT: [[TMP20:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK8-NEXT: [[CONV8:%.*]] = bitcast i64* [[N_CASTED]] to i32* +// CHECK8-NEXT: store i32 [[TMP20]], i32* [[CONV8]], align 4 +// CHECK8-NEXT: [[TMP21:%.*]] = load i64, i64* [[N_CASTED]], align 8 +// CHECK8-NEXT: [[TMP22:%.*]] = load i32, i32* [[CONV1]], align 8 +// CHECK8-NEXT: [[CONV9:%.*]] = bitcast i64* [[L_CASTED]] to i32* +// CHECK8-NEXT: store i32 [[TMP22]], i32* [[CONV9]], align 4 +// CHECK8-NEXT: [[TMP23:%.*]] = load i64, i64* [[L_CASTED]], align 8 +// CHECK8-NEXT: [[TMP24:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0 +// CHECK8-NEXT: [[TMP25:%.*]] = inttoptr i64 [[TMP17]] to i8* +// CHECK8-NEXT: store i8* [[TMP25]], i8** [[TMP24]], align 8 +// CHECK8-NEXT: [[TMP26:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1 +// CHECK8-NEXT: [[TMP27:%.*]] = inttoptr i64 [[TMP19]] to i8* +// CHECK8-NEXT: store i8* [[TMP27]], i8** [[TMP26]], align 8 +// CHECK8-NEXT: [[TMP28:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 2 +// CHECK8-NEXT: 
+// CHECK8-NEXT: store i8* [[TMP29]], i8** [[TMP28]], align 8
+// CHECK8-NEXT: [[TMP30:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 3
+// CHECK8-NEXT: [[TMP31:%.*]] = bitcast [1000 x i32]* [[TMP0]] to i8*
+// CHECK8-NEXT: store i8* [[TMP31]], i8** [[TMP30]], align 8
+// CHECK8-NEXT: [[TMP32:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 4
+// CHECK8-NEXT: [[TMP33:%.*]] = inttoptr i64 [[TMP23]] to i8*
+// CHECK8-NEXT: store i8* [[TMP33]], i8** [[TMP32]], align 8
+// CHECK8-NEXT: [[TMP34:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK8-NEXT: [[TMP35:%.*]] = load i32, i32* [[TMP34]], align 4
+// CHECK8-NEXT: [[TMP36:%.*]] = bitcast [5 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK8-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP35]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i64, i64, i64, [1000 x i32]*, i64)* @__omp_outlined__1 to i8*), i8* null, i8** [[TMP36]], i64 5)
+// CHECK8-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK8: omp.inner.for.inc:
+// CHECK8-NEXT: [[TMP37:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK8-NEXT: [[TMP38:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK8-NEXT: [[ADD10:%.*]] = add nsw i32 [[TMP37]], [[TMP38]]
+// CHECK8-NEXT: store i32 [[ADD10]], i32* [[DOTOMP_IV]], align 4
+// CHECK8-NEXT: [[TMP39:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK8-NEXT: [[TMP40:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK8-NEXT: [[ADD11:%.*]] = add nsw i32 [[TMP39]], [[TMP40]]
+// CHECK8-NEXT: store i32 [[ADD11]], i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK8-NEXT: [[TMP41:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK8-NEXT: [[TMP42:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK8-NEXT: [[ADD12:%.*]] = add nsw i32 [[TMP41]], [[TMP42]]
+// CHECK8-NEXT: store i32 [[ADD12]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK8-NEXT: [[TMP43:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK8-NEXT: [[TMP44:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4
+// CHECK8-NEXT: [[CMP13:%.*]] = icmp sgt i32 [[TMP43]], [[TMP44]]
+// CHECK8-NEXT: br i1 [[CMP13]], label [[COND_TRUE14:%.*]], label [[COND_FALSE15:%.*]]
+// CHECK8: cond.true14:
+// CHECK8-NEXT: [[TMP45:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4
+// CHECK8-NEXT: br label [[COND_END16:%.*]]
+// CHECK8: cond.false15:
+// CHECK8-NEXT: [[TMP46:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK8-NEXT: br label [[COND_END16]]
+// CHECK8: cond.end16:
+// CHECK8-NEXT: [[COND17:%.*]] = phi i32 [ [[TMP45]], [[COND_TRUE14]] ], [ [[TMP46]], [[COND_FALSE15]] ]
+// CHECK8-NEXT: store i32 [[COND17]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK8-NEXT: [[TMP47:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK8-NEXT: store i32 [[TMP47]], i32* [[DOTOMP_IV]], align 4
+// CHECK8-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP10:![0-9]+]]
+// CHECK8: omp.inner.for.end:
+// CHECK8-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK8: omp.loop.exit:
+// CHECK8-NEXT: [[TMP48:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK8-NEXT: [[TMP49:%.*]] = load i32, i32* [[TMP48]], align 4
+// CHECK8-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP49]])
+// CHECK8-NEXT: [[TMP50:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK8-NEXT: [[TMP51:%.*]] = icmp ne i32 [[TMP50]], 0
+// CHECK8-NEXT: br i1 [[TMP51]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]]
+// CHECK8: .omp.final.then:
+// CHECK8-NEXT: [[TMP52:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK8-NEXT: [[SUB18:%.*]] = sub nsw i32 [[TMP52]], 0
+// CHECK8-NEXT: [[DIV19:%.*]] = sdiv i32 [[SUB18]], 1
+// CHECK8-NEXT: [[MUL:%.*]] = mul nsw i32 [[DIV19]], 1
+// CHECK8-NEXT: [[ADD20:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK8-NEXT: store i32 [[ADD20]], i32* [[I5]], align 4
+// CHECK8-NEXT: br label [[DOTOMP_FINAL_DONE]]
+// CHECK8: .omp.final.done:
+// CHECK8-NEXT: [[TMP53:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK8-NEXT: [[TMP54:%.*]] = icmp ne i32 [[TMP53]], 0
+// CHECK8-NEXT: br i1 [[TMP54]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]]
+// CHECK8: .omp.lastprivate.then:
+// CHECK8-NEXT: [[TMP55:%.*]] = load i32, i32* [[CONV1]], align 8
+// CHECK8-NEXT: store i32 [[TMP55]], i32* [[CONV1]], align 8
+// CHECK8-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]]
+// CHECK8: .omp.lastprivate.done:
+// CHECK8-NEXT: br label [[OMP_PRECOND_END]]
+// CHECK8: omp.precond.end:
+// CHECK8-NEXT: call void @__kmpc_data_sharing_pop_stack(i8* [[TMP1]])
+// CHECK8-NEXT: ret void
+//
+//
+// CHECK8-LABEL: define {{[^@]+}}@__omp_outlined__1
+// CHECK8-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], i64 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i64 [[L:%.*]]) #[[ATTR0]] {
+// CHECK8-NEXT: entry:
+// CHECK8-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK8-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK8-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8
+// CHECK8-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8
+// CHECK8-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8
+// CHECK8-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 8
+// CHECK8-NEXT: [[L_ADDR:%.*]] = alloca i64, align 8
+// CHECK8-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK8-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK8-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK8-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4
+// CHECK8-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK8-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK8-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK8-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK8-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK8-NEXT: [[I6:%.*]] = alloca i32, align 4
+// CHECK8-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK8-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
+// CHECK8-NEXT: store i64 [[DOTPREVIOUS_LB_]], i64* [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK8-NEXT: store i64 [[DOTPREVIOUS_UB_]], i64* [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK8-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8
+// CHECK8-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 8
+// CHECK8-NEXT: store i64 [[L]], i64* [[L_ADDR]], align 8
+// CHECK8-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32*
+// CHECK8-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 8
+// CHECK8-NEXT: [[CONV1:%.*]] = bitcast i64* [[L_ADDR]] to i32*
+// CHECK8-NEXT: [[TMP1:%.*]] = load i32, i32* [[CONV]], align 8
+// CHECK8-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK8-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK8-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0
+// CHECK8-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK8-NEXT: [[SUB3:%.*]] = sub nsw i32 [[DIV]], 1
+// CHECK8-NEXT: store i32 [[SUB3]], i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK8-NEXT: store i32 0, i32* [[I]], align 4
+// CHECK8-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK8-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]]
+// CHECK8-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]]
+// CHECK8: omp.precond.then:
+// CHECK8-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK8-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK8-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_UB]], align 4
+// CHECK8-NEXT: [[TMP5:%.*]] = load i64, i64* [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK8-NEXT: [[CONV4:%.*]] = trunc i64 [[TMP5]] to i32
+// CHECK8-NEXT: [[TMP6:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK8-NEXT: [[CONV5:%.*]] = trunc i64 [[TMP6]] to i32
+// CHECK8-NEXT: store i32 [[CONV4]], i32* [[DOTOMP_LB]], align 4
+// CHECK8-NEXT: store i32 [[CONV5]], i32* [[DOTOMP_UB]], align 4
+// CHECK8-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK8-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK8-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK8-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4
+// CHECK8-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP8]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 32)
+// CHECK8-NEXT: br label [[OMP_DISPATCH_COND:%.*]]
+// CHECK8: omp.dispatch.cond:
+// CHECK8-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK8-NEXT: [[CONV7:%.*]] = sext i32 [[TMP9]] to i64
+// CHECK8-NEXT: [[TMP10:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK8-NEXT: [[CMP8:%.*]] = icmp ugt i64 [[CONV7]], [[TMP10]]
+// CHECK8-NEXT: br i1 [[CMP8]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK8: cond.true:
+// CHECK8-NEXT: [[TMP11:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK8-NEXT: br label [[COND_END:%.*]]
+// CHECK8: cond.false:
+// CHECK8-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK8-NEXT: [[CONV9:%.*]] = sext i32 [[TMP12]] to i64
+// CHECK8-NEXT: br label [[COND_END]]
+// CHECK8: cond.end:
+// CHECK8-NEXT: [[COND:%.*]] = phi i64 [ [[TMP11]], [[COND_TRUE]] ], [ [[CONV9]], [[COND_FALSE]] ]
+// CHECK8-NEXT: [[CONV10:%.*]] = trunc i64 [[COND]] to i32
+// CHECK8-NEXT: store i32 [[CONV10]], i32* [[DOTOMP_UB]], align 4
+// CHECK8-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK8-NEXT: store i32 [[TMP13]], i32* [[DOTOMP_IV]], align 4
+// CHECK8-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK8-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK8-NEXT: [[CMP11:%.*]] = icmp sle i32 [[TMP14]], [[TMP15]]
+// CHECK8-NEXT: br i1 [[CMP11]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]]
+// CHECK8: omp.dispatch.body:
+// CHECK8-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK8: omp.inner.for.cond:
+// CHECK8-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK8-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK8-NEXT: [[CMP12:%.*]] = icmp sle i32 [[TMP16]], [[TMP17]]
+// CHECK8-NEXT: br i1 [[CMP12]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK8: omp.inner.for.body:
+// CHECK8-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK8-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP18]], 1
+// CHECK8-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK8-NEXT: store i32 [[ADD]], i32* [[I6]], align 4
+// CHECK8-NEXT: [[TMP19:%.*]] = load i32, i32* [[I6]], align 4
+// CHECK8-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP19]] to i64
+// CHECK8-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1000 x i32], [1000 x i32]* [[TMP0]], i64 0, i64 [[IDXPROM]]
+// CHECK8-NEXT: store i32 1, i32* [[ARRAYIDX]], align 4
+// CHECK8-NEXT: [[TMP20:%.*]] = load i32, i32* [[I6]], align 4
+// CHECK8-NEXT: store i32 [[TMP20]], i32* [[CONV1]], align 8
+// CHECK8-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK8: omp.body.continue:
+// CHECK8-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK8: omp.inner.for.inc:
+// CHECK8-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK8-NEXT: [[ADD13:%.*]] = add nsw i32 [[TMP21]], 1
+// CHECK8-NEXT: store i32 [[ADD13]], i32* [[DOTOMP_IV]], align 4
+// CHECK8-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP12:![0-9]+]]
+// CHECK8: omp.inner.for.end:
+// CHECK8-NEXT: br label [[OMP_DISPATCH_INC:%.*]]
+// CHECK8: omp.dispatch.inc:
+// CHECK8-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK8-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK8-NEXT: [[ADD14:%.*]] = add nsw i32 [[TMP22]], [[TMP23]]
+// CHECK8-NEXT: store i32 [[ADD14]], i32* [[DOTOMP_LB]], align 4
+// CHECK8-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK8-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK8-NEXT: [[ADD15:%.*]] = add nsw i32 [[TMP24]], [[TMP25]]
+// CHECK8-NEXT: store i32 [[ADD15]], i32* [[DOTOMP_UB]], align 4
+// CHECK8-NEXT: br label [[OMP_DISPATCH_COND]]
+// CHECK8: omp.dispatch.end:
+// CHECK8-NEXT: [[TMP26:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK8-NEXT: [[TMP27:%.*]] = load i32, i32* [[TMP26]], align 4
+// CHECK8-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP27]])
+// CHECK8-NEXT: [[TMP28:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK8-NEXT: [[TMP29:%.*]] = icmp ne i32 [[TMP28]], 0
+// CHECK8-NEXT: br i1 [[TMP29]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]]
+// CHECK8: .omp.final.then:
+// CHECK8-NEXT: [[TMP30:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK8-NEXT: [[SUB16:%.*]] = sub nsw i32 [[TMP30]], 0
+// CHECK8-NEXT: [[DIV17:%.*]] = sdiv i32 [[SUB16]], 1
+// CHECK8-NEXT: [[MUL18:%.*]] = mul nsw i32 [[DIV17]], 1
+// CHECK8-NEXT: [[ADD19:%.*]] = add nsw i32 0, [[MUL18]]
+// CHECK8-NEXT: store i32 [[ADD19]], i32* [[I6]], align 4
+// CHECK8-NEXT: br label [[DOTOMP_FINAL_DONE]]
+// CHECK8: .omp.final.done:
+// CHECK8-NEXT: [[TMP31:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK8-NEXT: [[TMP32:%.*]] = icmp ne i32 [[TMP31]], 0
+// CHECK8-NEXT: br i1 [[TMP32]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]]
+// CHECK8: .omp.lastprivate.then:
+// CHECK8-NEXT: [[TMP33:%.*]] = load i32, i32* [[CONV1]], align 8
+// CHECK8-NEXT: store i32 [[TMP33]], i32* [[CONV1]], align 8
+// CHECK8-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]]
+// CHECK8: .omp.lastprivate.done:
+// CHECK8-NEXT: br label [[OMP_PRECOND_END]]
+// CHECK8: omp.precond.end:
+// CHECK8-NEXT: ret void
+//
+//
+// CHECK8-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l44
+// CHECK8-SAME: (i64 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] {
+// CHECK8-NEXT: entry:
+// CHECK8-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8
+// CHECK8-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 8
+// CHECK8-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8
+// CHECK8-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK8-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK8-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK8-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8
+// CHECK8-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 8
+// CHECK8-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32*
+// CHECK8-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 8
+// CHECK8-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK8-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0)
+// CHECK8-NEXT: br label [[DOTEXECUTE:%.*]]
+// CHECK8: .execute:
+// CHECK8-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]])
+// CHECK8-NEXT: [[TMP2:%.*]] = load i32, i32* [[CONV]], align 8
+// CHECK8-NEXT: [[CONV1:%.*]] = bitcast i64* [[N_CASTED]] to i32*
+// CHECK8-NEXT: store i32 [[TMP2]], i32* [[CONV1]], align 4
+// CHECK8-NEXT: [[TMP3:%.*]] = load i64, i64* [[N_CASTED]], align 8
+// CHECK8-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK8-NEXT: call void @__omp_outlined__2(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i64 [[TMP3]], [1000 x i16]* [[TMP0]]) #[[ATTR2]]
+// CHECK8-NEXT: br label [[DOTOMP_DEINIT:%.*]]
+// CHECK8: .omp.deinit:
+// CHECK8-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0)
+// CHECK8-NEXT: br label [[DOTEXIT:%.*]]
+// CHECK8: .exit:
+// CHECK8-NEXT: ret void
+//
+//
+// CHECK8-LABEL: define {{[^@]+}}@__omp_outlined__2
+// CHECK8-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] {
+// CHECK8-NEXT: entry:
+// CHECK8-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK8-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK8-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8
+// CHECK8-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 8
+// CHECK8-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK8-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK8-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK8-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
+// CHECK8-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK8-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK8-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK8-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK8-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK8-NEXT: [[I3:%.*]] = alloca i32, align 4
+// CHECK8-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8
+// CHECK8-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x i8*], align 8
+// CHECK8-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK8-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
+// CHECK8-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8
+// CHECK8-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 8
+// CHECK8-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32*
+// CHECK8-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 8
+// CHECK8-NEXT: [[TMP1:%.*]] = load i32, i32* [[CONV]], align 8
+// CHECK8-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK8-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK8-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0
+// CHECK8-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK8-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1
+// CHECK8-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK8-NEXT: store i32 0, i32* [[I]], align 4
+// CHECK8-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK8-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]]
+// CHECK8-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]]
+// CHECK8: omp.precond.then:
+// CHECK8-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK8-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK8-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK8-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK8-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK8-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK8-NEXT: [[TMP5:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK8-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4
+// CHECK8-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP6]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]])
+// CHECK8-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK8-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK8-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP7]], [[TMP8]]
+// CHECK8-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK8: cond.true:
+// CHECK8-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK8-NEXT: br label [[COND_END:%.*]]
+// CHECK8: cond.false:
+// CHECK8-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK8-NEXT: br label [[COND_END]]
+// CHECK8: cond.end:
+// CHECK8-NEXT: [[COND:%.*]] = phi i32 [ [[TMP9]], [[COND_TRUE]] ], [ [[TMP10]], [[COND_FALSE]] ]
+// CHECK8-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK8-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK8-NEXT: store i32 [[TMP11]], i32* [[DOTOMP_IV]], align 4
+// CHECK8-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK8: omp.inner.for.cond:
+// CHECK8-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK8-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK8-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP13]], 1
+// CHECK8-NEXT: [[CMP5:%.*]] = icmp slt i32 [[TMP12]], [[ADD]]
+// CHECK8-NEXT: br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK8: omp.inner.for.body:
+// CHECK8-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK8-NEXT: [[TMP15:%.*]] = zext i32 [[TMP14]] to i64
+// CHECK8-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK8-NEXT: [[TMP17:%.*]] = zext i32 [[TMP16]] to i64
+// CHECK8-NEXT: [[TMP18:%.*]] = load i32, i32* [[CONV]], align 8
+// CHECK8-NEXT: [[CONV6:%.*]] = bitcast i64* [[N_CASTED]] to i32*
+// CHECK8-NEXT: store i32 [[TMP18]], i32* [[CONV6]], align 4
+// CHECK8-NEXT: [[TMP19:%.*]] = load i64, i64* [[N_CASTED]], align 8
+// CHECK8-NEXT: [[TMP20:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0
+// CHECK8-NEXT: [[TMP21:%.*]] = inttoptr i64 [[TMP15]] to i8*
+// CHECK8-NEXT: store i8* [[TMP21]], i8** [[TMP20]], align 8
+// CHECK8-NEXT: [[TMP22:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1
+// CHECK8-NEXT: [[TMP23:%.*]] = inttoptr i64 [[TMP17]] to i8*
+// CHECK8-NEXT: store i8* [[TMP23]], i8** [[TMP22]], align 8
+// CHECK8-NEXT: [[TMP24:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 2
+// CHECK8-NEXT: [[TMP25:%.*]] = inttoptr i64 [[TMP19]] to i8*
+// CHECK8-NEXT: store i8* [[TMP25]], i8** [[TMP24]], align 8
+// CHECK8-NEXT: [[TMP26:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 3
+// CHECK8-NEXT: [[TMP27:%.*]] = bitcast [1000 x i16]* [[TMP0]] to i8*
+// CHECK8-NEXT: store i8* [[TMP27]], i8** [[TMP26]], align 8
+// CHECK8-NEXT: [[TMP28:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK8-NEXT: [[TMP29:%.*]] = load i32, i32* [[TMP28]], align 4
+// CHECK8-NEXT: [[TMP30:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK8-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP29]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i64, i64, i64, [1000 x i16]*)* @__omp_outlined__3 to i8*), i8* null, i8** [[TMP30]], i64 4)
+// CHECK8-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK8: omp.inner.for.inc:
+// CHECK8-NEXT: [[TMP31:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK8-NEXT: [[TMP32:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK8-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP31]], [[TMP32]]
+// CHECK8-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_IV]], align 4
+// CHECK8-NEXT: [[TMP33:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK8-NEXT: [[TMP34:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK8-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP33]], [[TMP34]]
+// CHECK8-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK8-NEXT: [[TMP35:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK8-NEXT: [[TMP36:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK8-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP35]], [[TMP36]]
+// CHECK8-NEXT: store i32 [[ADD9]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK8-NEXT: [[TMP37:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK8-NEXT: [[TMP38:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK8-NEXT: [[CMP10:%.*]] = icmp sgt i32 [[TMP37]], [[TMP38]]
+// CHECK8-NEXT: br i1 [[CMP10]], label [[COND_TRUE11:%.*]], label [[COND_FALSE12:%.*]]
+// CHECK8: cond.true11:
+// CHECK8-NEXT: [[TMP39:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK8-NEXT: br label [[COND_END13:%.*]]
+// CHECK8: cond.false12:
+// CHECK8-NEXT: [[TMP40:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK8-NEXT: br label [[COND_END13]]
+// CHECK8: cond.end13:
+// CHECK8-NEXT: [[COND14:%.*]] = phi i32 [ [[TMP39]], [[COND_TRUE11]] ], [ [[TMP40]], [[COND_FALSE12]] ]
+// CHECK8-NEXT: store i32 [[COND14]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK8-NEXT: [[TMP41:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK8-NEXT: store i32 [[TMP41]], i32* [[DOTOMP_IV]], align 4
+// CHECK8-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP13:![0-9]+]]
+// CHECK8: omp.inner.for.end:
+// CHECK8-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK8: omp.loop.exit:
+// CHECK8-NEXT: [[TMP42:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK8-NEXT: [[TMP43:%.*]] = load i32, i32* [[TMP42]], align 4
+// CHECK8-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP43]])
+// CHECK8-NEXT: [[TMP44:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK8-NEXT: [[TMP45:%.*]] = icmp ne i32 [[TMP44]], 0
+// CHECK8-NEXT: br i1 [[TMP45]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]]
+// CHECK8: .omp.final.then:
+// CHECK8-NEXT: [[TMP46:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK8-NEXT: [[SUB15:%.*]] = sub nsw i32 [[TMP46]], 0
+// CHECK8-NEXT: [[DIV16:%.*]] = sdiv i32 [[SUB15]], 1
+// CHECK8-NEXT: [[MUL:%.*]] = mul nsw i32 [[DIV16]], 1
+// CHECK8-NEXT: [[ADD17:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK8-NEXT: store i32 [[ADD17]], i32* [[I3]], align 4
+// CHECK8-NEXT: br label [[DOTOMP_FINAL_DONE]]
+// CHECK8: .omp.final.done:
+// CHECK8-NEXT: br label [[OMP_PRECOND_END]]
+// CHECK8: omp.precond.end:
+// CHECK8-NEXT: ret void
+//
+//
+// CHECK8-LABEL: define {{[^@]+}}@__omp_outlined__3
+// CHECK8-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], i64 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] {
+// CHECK8-NEXT: entry:
+// CHECK8-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK8-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK8-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8
+// CHECK8-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8
+// CHECK8-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8
+// CHECK8-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 8
+// CHECK8-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK8-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK8-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK8-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
+// CHECK8-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK8-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK8-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK8-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK8-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK8-NEXT: [[I5:%.*]] = alloca i32, align 4
+// CHECK8-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK8-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
+// CHECK8-NEXT: store i64 [[DOTPREVIOUS_LB_]], i64* [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK8-NEXT: store i64 [[DOTPREVIOUS_UB_]], i64* [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK8-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8
+// CHECK8-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 8
+// CHECK8-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32*
+// CHECK8-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 8
+// CHECK8-NEXT: [[TMP1:%.*]] = load i32, i32* [[CONV]], align 8
+// CHECK8-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK8-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK8-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0
+// CHECK8-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK8-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1
+// CHECK8-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK8-NEXT: store i32 0, i32* [[I]], align 4
+// CHECK8-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK8-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]]
+// CHECK8-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]]
+// CHECK8: omp.precond.then:
+// CHECK8-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK8-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK8-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_UB]], align 4
+// CHECK8-NEXT: [[TMP5:%.*]] = load i64, i64* [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK8-NEXT: [[CONV3:%.*]] = trunc i64 [[TMP5]] to i32
+// CHECK8-NEXT: [[TMP6:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK8-NEXT: [[CONV4:%.*]] = trunc i64 [[TMP6]] to i32
+// CHECK8-NEXT: store i32 [[CONV3]], i32* [[DOTOMP_LB]], align 4
+// CHECK8-NEXT: store i32 [[CONV4]], i32* [[DOTOMP_UB]], align 4
+// CHECK8-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK8-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK8-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK8-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4
+// CHECK8-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP8]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK8-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK8-NEXT: store i32 [[TMP9]], i32* [[DOTOMP_IV]], align 4
+// CHECK8-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK8: omp.inner.for.cond:
+// CHECK8-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK8-NEXT: [[CONV6:%.*]] = sext i32 [[TMP10]] to i64
+// CHECK8-NEXT: [[TMP11:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK8-NEXT: [[CMP7:%.*]] = icmp ule i64 [[CONV6]], [[TMP11]]
+// CHECK8-NEXT: br i1 [[CMP7]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK8: omp.inner.for.body:
+// CHECK8-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK8-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP12]], 1
+// CHECK8-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK8-NEXT: store i32 [[ADD]], i32* [[I5]], align 4
+// CHECK8-NEXT: [[TMP13:%.*]] = load i32, i32* [[I5]], align 4
+// CHECK8-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP13]] to i64
+// CHECK8-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1000 x i16], [1000 x i16]* [[TMP0]], i64 0, i64 [[IDXPROM]]
+// CHECK8-NEXT: [[TMP14:%.*]] = load i16, i16* [[ARRAYIDX]], align 2
+// CHECK8-NEXT: [[CONV8:%.*]] = sext i16 [[TMP14]] to i32
+// CHECK8-NEXT: [[ADD9:%.*]] = add nsw i32 [[CONV8]], 1
+// CHECK8-NEXT: [[CONV10:%.*]] = trunc i32 [[ADD9]] to i16
+// CHECK8-NEXT: store i16 [[CONV10]], i16* [[ARRAYIDX]], align 2
+// CHECK8-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK8: omp.body.continue:
+// CHECK8-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK8: omp.inner.for.inc:
+// CHECK8-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK8-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK8-NEXT: [[ADD11:%.*]] = add nsw i32 [[TMP15]], [[TMP16]]
+// CHECK8-NEXT: store i32 [[ADD11]], i32* [[DOTOMP_IV]], align 4
+// CHECK8-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP14:![0-9]+]]
+// CHECK8: omp.inner.for.end:
+// CHECK8-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK8: omp.loop.exit:
+// CHECK8-NEXT: [[TMP17:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK8-NEXT: [[TMP18:%.*]] = load i32, i32* [[TMP17]], align 4
+// CHECK8-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP18]])
+// CHECK8-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK8-NEXT: [[TMP20:%.*]] = icmp ne i32 [[TMP19]], 0
+// CHECK8-NEXT: br i1 [[TMP20]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]]
+// CHECK8: .omp.final.then:
+// CHECK8-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK8-NEXT: [[SUB12:%.*]] = sub nsw i32 [[TMP21]], 0
+// CHECK8-NEXT: [[DIV13:%.*]] = sdiv i32 [[SUB12]], 1
+// CHECK8-NEXT: [[MUL14:%.*]] = mul nsw i32 [[DIV13]], 1
+// CHECK8-NEXT: [[ADD15:%.*]] = add nsw i32 0, [[MUL14]]
+// CHECK8-NEXT: store i32 [[ADD15]], i32* [[I5]], align 4
+// CHECK8-NEXT: br label [[DOTOMP_FINAL_DONE]]
+// CHECK8: .omp.final.done:
+// CHECK8-NEXT: br label [[OMP_PRECOND_END]]
+// CHECK8: omp.precond.end:
+// CHECK8-NEXT: ret void
+//
+//
+// CHECK8-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l49
+// CHECK8-SAME: ([10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] {
+// CHECK8-NEXT: entry:
+// CHECK8-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 8
+// CHECK8-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK8-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK8-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK8-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 8
+// CHECK8-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 8
+// CHECK8-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK8-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0)
+// CHECK8-NEXT: br label [[DOTEXECUTE:%.*]]
+// CHECK8: .execute:
+// CHECK8-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]])
+// CHECK8-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK8-NEXT: call void @__omp_outlined__4(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x i32]* [[TMP0]]) #[[ATTR2]]
+// CHECK8-NEXT: br label [[DOTOMP_DEINIT:%.*]]
+// CHECK8: .omp.deinit:
+// CHECK8-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0)
+// CHECK8-NEXT: br label [[DOTEXIT:%.*]]
+// CHECK8: .exit:
+// CHECK8-NEXT: ret void
+//
+//
+// CHECK8-LABEL: define {{[^@]+}}@__omp_outlined__4
+// CHECK8-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] {
+// CHECK8-NEXT: entry:
+// CHECK8-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK8-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK8-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 8
+// CHECK8-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK8-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK8-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK8-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK8-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK8-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK8-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK8-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [3 x i8*], align 8
+// CHECK8-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK8-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
+// CHECK8-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 8
+// CHECK8-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 8
+// CHECK8-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK8-NEXT: store i32 9, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK8-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK8-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK8-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK8-NEXT: [[TMP1:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK8-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4
+// CHECK8-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]])
+// CHECK8-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK8-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP3]], 9
+// CHECK8-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK8: cond.true:
+// CHECK8-NEXT: br label [[COND_END:%.*]]
+// CHECK8: cond.false:
+// CHECK8-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK8-NEXT: br label [[COND_END]]
+// CHECK8: cond.end:
+// CHECK8-NEXT: [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP4]], [[COND_FALSE]] ]
+// CHECK8-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK8-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK8-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4
+// CHECK8-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK8: omp.inner.for.cond:
+// CHECK8-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK8-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP6]], 10
+// CHECK8-NEXT: br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK8: omp.inner.for.body:
+// CHECK8-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK8-NEXT: [[TMP8:%.*]] = zext i32 [[TMP7]] to i64
+// CHECK8-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK8-NEXT: [[TMP10:%.*]] = zext i32 [[TMP9]] to i64
+// CHECK8-NEXT: [[TMP11:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0
+// CHECK8-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP8]] to i8*
+// CHECK8-NEXT: store i8* [[TMP12]], i8** [[TMP11]], align 8
+// CHECK8-NEXT: [[TMP13:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1
+// CHECK8-NEXT: [[TMP14:%.*]] = inttoptr i64 [[TMP10]] to i8*
+// CHECK8-NEXT: store i8* [[TMP14]], i8** [[TMP13]], align 8
+// CHECK8-NEXT: [[TMP15:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 2
+// CHECK8-NEXT: [[TMP16:%.*]] = bitcast [10 x i32]* [[TMP0]] to i8*
+// CHECK8-NEXT: store i8* [[TMP16]], i8** [[TMP15]], align 8
+// CHECK8-NEXT: [[TMP17:%.*]] = bitcast [3 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK8-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i64, i64, [10 x i32]*)* @__omp_outlined__5 to i8*), i8* null, i8** [[TMP17]], i64 3)
+// CHECK8-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK8: omp.inner.for.inc:
+// CHECK8-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK8-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK8-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP18]], [[TMP19]]
+// CHECK8-NEXT: store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4
+// CHECK8-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK8-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK8-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP20]], [[TMP21]]
+// CHECK8-NEXT: store i32 [[ADD2]], i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK8-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK8-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK8-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP22]], [[TMP23]]
+// CHECK8-NEXT: store i32 [[ADD3]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK8-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK8-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP24]], 9
+// CHECK8-NEXT: br i1 [[CMP4]], label [[COND_TRUE5:%.*]], label [[COND_FALSE6:%.*]]
+// CHECK8: cond.true5:
+// CHECK8-NEXT: br label [[COND_END7:%.*]]
+// CHECK8: cond.false6:
+// CHECK8-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK8-NEXT: br label [[COND_END7]]
+// CHECK8: cond.end7:
+// CHECK8-NEXT: [[COND8:%.*]] = phi i32 [ 9, [[COND_TRUE5]] ], [ [[TMP25]], [[COND_FALSE6]] ]
+// CHECK8-NEXT: store i32 [[COND8]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK8-NEXT: [[TMP26:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK8-NEXT: store i32 [[TMP26]], i32* [[DOTOMP_IV]], align 4
+// CHECK8-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP15:![0-9]+]]
+// CHECK8: omp.inner.for.end:
+// CHECK8-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK8: omp.loop.exit:
+// CHECK8-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]])
+// CHECK8-NEXT: [[TMP27:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK8-NEXT: [[TMP28:%.*]] = icmp ne i32 [[TMP27]], 0
+// CHECK8-NEXT: br i1 [[TMP28]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]]
+// CHECK8: .omp.final.then:
+// CHECK8-NEXT: store i32 10, i32* [[I]], align 4
+// CHECK8-NEXT: br label [[DOTOMP_FINAL_DONE]]
+// CHECK8: .omp.final.done:
+// CHECK8-NEXT: ret void
+//
+//
+// CHECK8-LABEL: define {{[^@]+}}@__omp_outlined__5
+// CHECK8-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] {
+// CHECK8-NEXT: entry:
+// CHECK8-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK8-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK8-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8
+// CHECK8-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8
+// CHECK8-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 8
+// CHECK8-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK8-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK8-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK8-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK8-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK8-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK8-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK8-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK8-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
+// CHECK8-NEXT: store i64 [[DOTPREVIOUS_LB_]], i64* [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK8-NEXT: store i64 [[DOTPREVIOUS_UB_]], i64* [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK8-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 8
+// CHECK8-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 8
+// CHECK8-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK8-NEXT: store i32 9, i32* [[DOTOMP_UB]], align 4
+// CHECK8-NEXT: [[TMP1:%.*]] = load i64, i64* [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK8-NEXT: [[CONV:%.*]] = trunc i64 [[TMP1]] to i32
+// CHECK8-NEXT: [[TMP2:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK8-NEXT: [[CONV1:%.*]] = trunc i64 [[TMP2]] to i32
+// CHECK8-NEXT: store i32 [[CONV]], i32* [[DOTOMP_LB]], align 4
+// CHECK8-NEXT: store i32 [[CONV1]], i32* [[DOTOMP_UB]], align 4
+// CHECK8-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK8-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK8-NEXT: [[TMP3:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK8-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4
+// CHECK8-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP4]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK8-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK8-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4
+// CHECK8-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK8: omp.inner.for.cond:
+// CHECK8-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK8-NEXT: [[CONV2:%.*]] = sext i32 [[TMP6]] to i64
+// CHECK8-NEXT: [[TMP7:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK8-NEXT: [[CMP:%.*]] = icmp ule i64 [[CONV2]], [[TMP7]]
+// CHECK8-NEXT: br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK8: omp.inner.for.body:
+// CHECK8-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK8-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP8]], 1
+// CHECK8-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK8-NEXT: store i32 [[ADD]], i32* [[I]], align 4
+// CHECK8-NEXT: [[TMP9:%.*]] = load i32, i32* [[I]], align 4
+// CHECK8-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP9]] to i64
+// CHECK8-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP0]], i64 0, i64 [[IDXPROM]]
+// CHECK8-NEXT: [[TMP10:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+// CHECK8-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP10]], 1
+// CHECK8-NEXT: store i32 [[ADD3]], i32* [[ARRAYIDX]], align 4
+// CHECK8-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK8: omp.body.continue:
+// CHECK8-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK8: omp.inner.for.inc:
+// CHECK8-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK8-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK8-NEXT: [[ADD4:%.*]] = add nsw i32 [[TMP11]], [[TMP12]]
+// CHECK8-NEXT: store i32 [[ADD4]], i32* [[DOTOMP_IV]], align 4
+// CHECK8-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP16:![0-9]+]]
+// CHECK8: omp.inner.for.end:
+// CHECK8-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK8: omp.loop.exit:
+// CHECK8-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP4]])
+// CHECK8-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK8-NEXT: [[TMP14:%.*]] = icmp ne i32 [[TMP13]], 0
+// CHECK8-NEXT: br i1 [[TMP14]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]]
+// CHECK8: .omp.final.then:
+// CHECK8-NEXT: store i32 10, i32* [[I]], align 4
+// CHECK8-NEXT: br label [[DOTOMP_FINAL_DONE]]
+// CHECK8: .omp.final.done:
+// CHECK8-NEXT: ret void
+//
+//
+// CHECK8-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l54
+// CHECK8-SAME: ([10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i64 [[F:%.*]]) #[[ATTR0]] {
+// CHECK8-NEXT: entry:
+// CHECK8-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 8
+// CHECK8-NEXT: [[F_ADDR:%.*]] = alloca i64, align 8
+// CHECK8-NEXT: [[F_CASTED:%.*]] = alloca i64, align 8
+// CHECK8-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK8-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK8-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK8-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 8
+// CHECK8-NEXT: store i64 [[F]], i64* [[F_ADDR]], align 8
+// CHECK8-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 8
+// CHECK8-NEXT: [[CONV:%.*]] = bitcast i64* [[F_ADDR]] to i32*
+// CHECK8-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK8-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0)
+// CHECK8-NEXT: br label [[DOTEXECUTE:%.*]]
+// CHECK8: .execute:
+// CHECK8-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]])
+// CHECK8-NEXT: [[TMP2:%.*]] = load i32, i32* [[CONV]], align 8
+// CHECK8-NEXT: [[CONV1:%.*]] = bitcast i64* [[F_CASTED]] to i32*
+// CHECK8-NEXT: store i32 [[TMP2]], i32* [[CONV1]], align 4
+// CHECK8-NEXT: [[TMP3:%.*]] = load i64, i64* [[F_CASTED]], align 8
+// CHECK8-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK8-NEXT: call void @__omp_outlined__6(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x [10 x i32]]* [[TMP0]], i64 [[TMP3]]) #[[ATTR2]]
+// CHECK8-NEXT: br label [[DOTOMP_DEINIT:%.*]]
+// CHECK8: .omp.deinit:
+// CHECK8-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0)
+// CHECK8-NEXT: br label [[DOTEXIT:%.*]]
+// CHECK8: .exit:
+// CHECK8-NEXT: ret void
+//
+//
+// CHECK8-LABEL: define {{[^@]+}}@__omp_outlined__6
+// CHECK8-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i64 [[F:%.*]]) #[[ATTR0]] {
+// CHECK8-NEXT: entry:
+// CHECK8-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK8-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK8-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 8
+// CHECK8-NEXT: [[F_ADDR:%.*]] = alloca i64, align 8
+// CHECK8-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK8-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK8-NEXT: [[_TMP1:%.*]] = alloca i32, align 4
+// CHECK8-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK8-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK8-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK8-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK8-NEXT: [[K:%.*]] = alloca i32, align 4
+// CHECK8-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK8-NEXT: [[J:%.*]] = alloca i32, align 4
+// CHECK8-NEXT: [[F_CASTED:%.*]] = alloca i64, align 8
+// CHECK8-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x i8*], align 8
+// CHECK8-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK8-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
+// CHECK8-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 8
+// CHECK8-NEXT: store i64 [[F]], i64* [[F_ADDR]], align 8
+// CHECK8-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 8
+// CHECK8-NEXT: [[CONV:%.*]] = bitcast i64* [[F_ADDR]] to i32*
+// CHECK8-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK8-NEXT: store i32 99, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK8-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK8-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK8-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK8-NEXT: [[TMP1:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK8-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4
+// CHECK8-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]])
+// CHECK8-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK8-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP3]], 99
+// CHECK8-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK8: cond.true:
+// CHECK8-NEXT: br label [[COND_END:%.*]]
+// CHECK8: cond.false:
+// CHECK8-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK8-NEXT: br label [[COND_END]]
+// CHECK8: cond.end:
+// CHECK8-NEXT: [[COND:%.*]] = phi i32 [ 99, [[COND_TRUE]] ], [ [[TMP4]], [[COND_FALSE]] ]
+// CHECK8-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK8-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK8-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4
+// CHECK8-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK8: omp.inner.for.cond:
+// CHECK8-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK8-NEXT: [[CMP2:%.*]] = icmp slt i32 [[TMP6]], 100
+// CHECK8-NEXT: br i1 [[CMP2]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK8: omp.inner.for.body:
+// CHECK8-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK8-NEXT: [[TMP8:%.*]] = zext i32 [[TMP7]] to i64
+// CHECK8-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK8-NEXT: [[TMP10:%.*]] = zext i32 [[TMP9]] to i64
+// CHECK8-NEXT: [[TMP11:%.*]] = load i32, i32* [[CONV]], align 8
+// CHECK8-NEXT: [[CONV3:%.*]] = bitcast i64* [[F_CASTED]] to i32*
+// CHECK8-NEXT: store i32 [[TMP11]], i32* [[CONV3]], align 4
+// CHECK8-NEXT: [[TMP12:%.*]] = load i64, i64* [[F_CASTED]], align 8
+// CHECK8-NEXT: [[TMP13:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0
+// CHECK8-NEXT: [[TMP14:%.*]] = inttoptr i64 [[TMP8]] to i8*
+// CHECK8-NEXT: store i8* [[TMP14]], i8** [[TMP13]], align 8
+// CHECK8-NEXT: [[TMP15:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1
+// CHECK8-NEXT: [[TMP16:%.*]] = inttoptr i64 [[TMP10]] to i8*
+// CHECK8-NEXT: store i8* [[TMP16]], i8** [[TMP15]], align 8
+// CHECK8-NEXT: [[TMP17:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 2
+// CHECK8-NEXT: [[TMP18:%.*]] = bitcast [10 x [10 x i32]]* [[TMP0]] to i8*
+// CHECK8-NEXT: store i8* [[TMP18]], i8** [[TMP17]], align 8
+// CHECK8-NEXT: [[TMP19:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 3
+// CHECK8-NEXT: [[TMP20:%.*]] = inttoptr i64 [[TMP12]] to i8*
+// CHECK8-NEXT: store i8* [[TMP20]], i8** [[TMP19]], align 8
+// CHECK8-NEXT: [[TMP21:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK8-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i64, i64, [10 x [10 x i32]]*, i64)* @__omp_outlined__7 to i8*), i8* null, i8** [[TMP21]], i64 4)
+// CHECK8-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK8: omp.inner.for.inc:
+// CHECK8-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK8-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK8-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP22]], [[TMP23]]
+// CHECK8-NEXT: store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4
+// CHECK8-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK8-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK8-NEXT: [[ADD4:%.*]] = add nsw i32 [[TMP24]], [[TMP25]]
+// CHECK8-NEXT: store i32 [[ADD4]], i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK8-NEXT: [[TMP26:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK8-NEXT: [[TMP27:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK8-NEXT: [[ADD5:%.*]] = add nsw i32 [[TMP26]], [[TMP27]]
+// CHECK8-NEXT: store i32 [[ADD5]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK8-NEXT: [[TMP28:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK8-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[TMP28]], 99
+// CHECK8-NEXT: br i1 [[CMP6]], label [[COND_TRUE7:%.*]], label [[COND_FALSE8:%.*]]
+// CHECK8: cond.true7:
+// CHECK8-NEXT: br label [[COND_END9:%.*]]
+// CHECK8: cond.false8:
+// CHECK8-NEXT: [[TMP29:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK8-NEXT: br label [[COND_END9]]
+// CHECK8: cond.end9:
+// CHECK8-NEXT: [[COND10:%.*]] = phi i32 [ 99, [[COND_TRUE7]] ], [ [[TMP29]], [[COND_FALSE8]] ]
+// CHECK8-NEXT: store i32 [[COND10]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK8-NEXT: [[TMP30:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK8-NEXT: store i32 [[TMP30]], i32* [[DOTOMP_IV]], align 4
+// CHECK8-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP17:![0-9]+]]
+// CHECK8: omp.inner.for.end:
+// CHECK8-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK8: omp.loop.exit:
+// CHECK8-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]])
+// CHECK8-NEXT: [[TMP31:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK8-NEXT: [[TMP32:%.*]] = icmp ne i32 [[TMP31]], 0
+// CHECK8-NEXT: br i1 [[TMP32]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]]
+// CHECK8: .omp.final.then:
+// CHECK8-NEXT: store i32 10, i32* [[I]], align 4
+// CHECK8-NEXT: store i32 10, i32* [[J]], align 4
+// CHECK8-NEXT: br label [[DOTOMP_FINAL_DONE]]
+// CHECK8: .omp.final.done:
+// CHECK8-NEXT: ret void
+//
+//
+// CHECK8-LABEL: define {{[^@]+}}@__omp_outlined__7
+// CHECK8-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i64 [[F:%.*]]) #[[ATTR0]] {
+// CHECK8-NEXT: entry:
+// CHECK8-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK8-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK8-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8
+// CHECK8-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8
+// CHECK8-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 8
+// CHECK8-NEXT: [[F_ADDR:%.*]] = alloca i64, align 8
+// CHECK8-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK8-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK8-NEXT: [[_TMP1:%.*]] = alloca i32, align 4
+// CHECK8-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK8-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK8-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK8-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK8-NEXT: [[K:%.*]] = alloca i32, align 4
+// CHECK8-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK8-NEXT: [[J:%.*]] = alloca i32, align 4
+// CHECK8-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK8-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
+// CHECK8-NEXT: store i64 [[DOTPREVIOUS_LB_]], i64* [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK8-NEXT: store i64 [[DOTPREVIOUS_UB_]], i64* [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK8-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 8
+// CHECK8-NEXT: store i64 [[F]], i64* [[F_ADDR]], align 8
+// CHECK8-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 8
+// CHECK8-NEXT: [[CONV:%.*]] = bitcast i64* [[F_ADDR]] to i32*
+// CHECK8-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK8-NEXT: store i32 99, i32* [[DOTOMP_UB]], align 4
+// CHECK8-NEXT: [[TMP1:%.*]] = load i64, i64* [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK8-NEXT: [[CONV2:%.*]] = trunc i64 [[TMP1]] to i32
+// CHECK8-NEXT: [[TMP2:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK8-NEXT: [[CONV3:%.*]] = trunc i64 [[TMP2]] to i32
+// CHECK8-NEXT: store i32 [[CONV2]], i32* [[DOTOMP_LB]], align 4
+// CHECK8-NEXT: store i32 [[CONV3]], i32* [[DOTOMP_UB]], align 4
+// CHECK8-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK8-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK8-NEXT: [[TMP3:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK8-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4
+// CHECK8-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP4]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK8-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK8-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4
+// CHECK8-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK8: omp.inner.for.cond:
+// CHECK8-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK8-NEXT: [[CONV4:%.*]] = sext i32 [[TMP6]] to i64
+// CHECK8-NEXT: [[TMP7:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK8-NEXT: [[CMP:%.*]] = icmp ule i64 [[CONV4]], [[TMP7]]
+// CHECK8-NEXT: br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK8: omp.inner.for.body:
+// CHECK8-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK8-NEXT: [[DIV:%.*]] = sdiv i32 [[TMP8]], 10
+// CHECK8-NEXT: [[MUL:%.*]] = mul nsw i32 [[DIV]], 1
+// CHECK8-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK8-NEXT: store i32 [[ADD]], i32* [[I]], align 4
+// CHECK8-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK8-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK8-NEXT: [[DIV5:%.*]] = sdiv i32 [[TMP10]], 10
+// CHECK8-NEXT: [[MUL6:%.*]] = mul nsw i32 [[DIV5]], 10
+// CHECK8-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP9]], [[MUL6]]
+// CHECK8-NEXT: [[MUL7:%.*]] = mul nsw i32 [[SUB]], 1
+// CHECK8-NEXT: [[ADD8:%.*]] = add nsw i32 0, [[MUL7]]
+// CHECK8-NEXT: store i32 [[ADD8]], i32* [[J]], align 4
+// CHECK8-NEXT: store i32 10, i32* [[K]], align 4
+// CHECK8-NEXT: [[TMP11:%.*]] = load i32, i32* [[I]], align 4
+// CHECK8-NEXT: [[TMP12:%.*]] = load i32, i32* [[J]], align 4
+// CHECK8-NEXT: [[TMP13:%.*]] = load i32, i32* [[CONV]], align 8
+// CHECK8-NEXT: [[MUL9:%.*]] = mul nsw i32 [[TMP12]], [[TMP13]]
+// CHECK8-NEXT: [[ADD10:%.*]] = add nsw i32 [[TMP11]], [[MUL9]]
+// CHECK8-NEXT: [[TMP14:%.*]] = load i32, i32* [[K]], align 4
+// CHECK8-NEXT: [[ADD11:%.*]] = add nsw i32 [[ADD10]], [[TMP14]]
+// CHECK8-NEXT: [[TMP15:%.*]] = load i32, i32* [[I]], align 4
CHECK8-NEXT: [[TMP15:%.*]] = load i32, i32* [[I]], align 4 +// CHECK8-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP15]] to i64 +// CHECK8-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* [[TMP0]], i64 0, i64 [[IDXPROM]] +// CHECK8-NEXT: [[TMP16:%.*]] = load i32, i32* [[J]], align 4 +// CHECK8-NEXT: [[IDXPROM12:%.*]] = sext i32 [[TMP16]] to i64 +// CHECK8-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[ARRAYIDX]], i64 0, i64 [[IDXPROM12]] +// CHECK8-NEXT: store i32 [[ADD11]], i32* [[ARRAYIDX13]], align 4 +// CHECK8-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK8: omp.body.continue: +// CHECK8-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK8: omp.inner.for.inc: +// CHECK8-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK8-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK8-NEXT: [[ADD14:%.*]] = add nsw i32 [[TMP17]], [[TMP18]] +// CHECK8-NEXT: store i32 [[ADD14]], i32* [[DOTOMP_IV]], align 4 +// CHECK8-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP18:![0-9]+]] +// CHECK8: omp.inner.for.end: +// CHECK8-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK8: omp.loop.exit: +// CHECK8-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP4]]) +// CHECK8-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK8-NEXT: [[TMP20:%.*]] = icmp ne i32 [[TMP19]], 0 +// CHECK8-NEXT: br i1 [[TMP20]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] +// CHECK8: .omp.final.then: +// CHECK8-NEXT: store i32 10, i32* [[I]], align 4 +// CHECK8-NEXT: store i32 10, i32* [[J]], align 4 +// CHECK8-NEXT: br label [[DOTOMP_FINAL_DONE]] +// CHECK8: .omp.final.done: +// CHECK8-NEXT: ret void +// +// +// CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l38 +// CHECK9-SAME: (i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32 [[L:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK9-NEXT: entry: +// CHECK9-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4 +// CHECK9-NEXT: [[L_ADDR:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[L_CASTED:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK9-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK9-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4 +// CHECK9-NEXT: store i32 [[L]], i32* [[L_ADDR]], align 4 +// CHECK9-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4 +// CHECK9-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK9-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) +// CHECK9-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK9: .execute: +// CHECK9-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3:[0-9]+]]) +// CHECK9-NEXT: [[TMP2:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK9-NEXT: store i32 [[TMP2]], i32* [[N_CASTED]], align 4 +// CHECK9-NEXT: [[TMP3:%.*]] = load i32, i32* [[N_CASTED]], align 4 +// CHECK9-NEXT: [[TMP4:%.*]] = load i32, i32* [[L_ADDR]], align 4 +// CHECK9-NEXT: store i32 [[TMP4]], i32* [[L_CASTED]], align 4 +// CHECK9-NEXT: [[TMP5:%.*]] = load i32, i32* [[L_CASTED]], align 4 +// CHECK9-NEXT: store i32 [[TMP1]], 
i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK9-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32 [[TMP3]], [1000 x i32]* [[TMP0]], i32 [[TMP5]]) #[[ATTR2:[0-9]+]] +// CHECK9-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK9: .omp.deinit: +// CHECK9-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) +// CHECK9-NEXT: br label [[DOTEXIT:%.*]] +// CHECK9: .exit: +// CHECK9-NEXT: ret void +// +// +// CHECK9-LABEL: define {{[^@]+}}@__omp_outlined__ +// CHECK9-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32 [[L:%.*]]) #[[ATTR0]] { +// CHECK9-NEXT: entry: +// CHECK9-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK9-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK9-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4 +// CHECK9-NEXT: [[L_ADDR:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[I4:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[L_CASTED:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [5 x i8*], align 4 +// CHECK9-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK9-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK9-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK9-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4 +// CHECK9-NEXT: store i32 [[L]], i32* [[L_ADDR]], align 4 +// CHECK9-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4 +// CHECK9-NEXT: [[TMP1:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared", align 2 +// CHECK9-NEXT: [[TMP2:%.*]] = load i32, i32* @"_openmp_static_kernel$size", align 4 +// CHECK9-NEXT: call void @__kmpc_get_team_static_memory(i16 1, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds (%"union._shared_openmp_static_memory_type_$_", %"union._shared_openmp_static_memory_type_$_" addrspace(3)* @"_openmp_shared_static_glob_rd_$_", i32 0, i32 0, i32 0) to i8*), i32 [[TMP2]], i16 [[TMP1]], i8** addrspacecast (i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr" to i8**)) +// CHECK9-NEXT: [[TMP3:%.*]] = load i8*, i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr", align 4 +// CHECK9-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, i8* [[TMP3]], i32 0 +// CHECK9-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to %struct._globalized_locals_ty* +// CHECK9-NEXT: [[L1:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP5]], i32 0, i32 0 +// CHECK9-NEXT: [[TMP6:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK9-NEXT: store i32 [[TMP6]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK9-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK9-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP7]], 0 +// CHECK9-NEXT: [[DIV:%.*]] = sdiv i32 
[[SUB]], 1 +// CHECK9-NEXT: [[SUB3:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK9-NEXT: store i32 [[SUB3]], i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK9-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK9-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK9-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP8]] +// CHECK9-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK9: omp.precond.then: +// CHECK9-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK9-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK9-NEXT: store i32 [[TMP9]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK9-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK9-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK9-NEXT: [[TMP10:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK9-NEXT: [[TMP11:%.*]] = load i32, i32* [[TMP10]], align 4 +// CHECK9-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1:[0-9]+]], i32 [[TMP11]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 128) +// CHECK9-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK9-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK9-NEXT: [[CMP5:%.*]] = icmp sgt i32 [[TMP12]], [[TMP13]] +// CHECK9-NEXT: br i1 [[CMP5]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK9: cond.true: +// CHECK9-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK9-NEXT: br label [[COND_END:%.*]] +// CHECK9: cond.false: +// CHECK9-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK9-NEXT: br label [[COND_END]] +// CHECK9: cond.end: +// CHECK9-NEXT: [[COND:%.*]] = phi i32 [ [[TMP14]], [[COND_TRUE]] ], [ [[TMP15]], [[COND_FALSE]] ] +// CHECK9-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK9-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK9-NEXT: store i32 [[TMP16]], i32* [[DOTOMP_IV]], align 4 +// CHECK9-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK9: omp.inner.for.cond: +// CHECK9-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK9-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK9-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP18]], 1 +// CHECK9-NEXT: [[CMP6:%.*]] = icmp slt i32 [[TMP17]], [[ADD]] +// CHECK9-NEXT: br i1 [[CMP6]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK9: omp.inner.for.body: +// CHECK9-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK9-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK9-NEXT: [[TMP21:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK9-NEXT: store i32 [[TMP21]], i32* [[N_CASTED]], align 4 +// CHECK9-NEXT: [[TMP22:%.*]] = load i32, i32* [[N_CASTED]], align 4 +// CHECK9-NEXT: [[TMP23:%.*]] = load i32, i32* [[L_ADDR]], align 4 +// CHECK9-NEXT: store i32 [[TMP23]], i32* [[L_CASTED]], align 4 +// CHECK9-NEXT: [[TMP24:%.*]] = load i32, i32* [[L_CASTED]], align 4 +// CHECK9-NEXT: [[TMP25:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 +// CHECK9-NEXT: [[TMP26:%.*]] = inttoptr i32 [[TMP19]] to i8* +// CHECK9-NEXT: store i8* [[TMP26]], i8** [[TMP25]], align 4 +// CHECK9-NEXT: [[TMP27:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 +// CHECK9-NEXT: [[TMP28:%.*]] = inttoptr i32 [[TMP20]] to i8* +// 
CHECK9-NEXT: store i8* [[TMP28]], i8** [[TMP27]], align 4 +// CHECK9-NEXT: [[TMP29:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 +// CHECK9-NEXT: [[TMP30:%.*]] = inttoptr i32 [[TMP22]] to i8* +// CHECK9-NEXT: store i8* [[TMP30]], i8** [[TMP29]], align 4 +// CHECK9-NEXT: [[TMP31:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3 +// CHECK9-NEXT: [[TMP32:%.*]] = bitcast [1000 x i32]* [[TMP0]] to i8* +// CHECK9-NEXT: store i8* [[TMP32]], i8** [[TMP31]], align 4 +// CHECK9-NEXT: [[TMP33:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 4 +// CHECK9-NEXT: [[TMP34:%.*]] = inttoptr i32 [[TMP24]] to i8* +// CHECK9-NEXT: store i8* [[TMP34]], i8** [[TMP33]], align 4 +// CHECK9-NEXT: [[TMP35:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK9-NEXT: [[TMP36:%.*]] = load i32, i32* [[TMP35]], align 4 +// CHECK9-NEXT: [[TMP37:%.*]] = bitcast [5 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK9-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP36]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, i32, [1000 x i32]*, i32)* @__omp_outlined__1 to i8*), i8* null, i8** [[TMP37]], i32 5) +// CHECK9-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK9: omp.inner.for.inc: +// CHECK9-NEXT: [[TMP38:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK9-NEXT: [[TMP39:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK9-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP38]], [[TMP39]] +// CHECK9-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_IV]], align 4 +// CHECK9-NEXT: [[TMP40:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK9-NEXT: [[TMP41:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK9-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP40]], [[TMP41]] +// CHECK9-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK9-NEXT: [[TMP42:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK9-NEXT: [[TMP43:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK9-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP42]], [[TMP43]] +// CHECK9-NEXT: store i32 [[ADD9]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK9-NEXT: [[TMP44:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK9-NEXT: [[TMP45:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK9-NEXT: [[CMP10:%.*]] = icmp sgt i32 [[TMP44]], [[TMP45]] +// CHECK9-NEXT: br i1 [[CMP10]], label [[COND_TRUE11:%.*]], label [[COND_FALSE12:%.*]] +// CHECK9: cond.true11: +// CHECK9-NEXT: [[TMP46:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK9-NEXT: br label [[COND_END13:%.*]] +// CHECK9: cond.false12: +// CHECK9-NEXT: [[TMP47:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK9-NEXT: br label [[COND_END13]] +// CHECK9: cond.end13: +// CHECK9-NEXT: [[COND14:%.*]] = phi i32 [ [[TMP46]], [[COND_TRUE11]] ], [ [[TMP47]], [[COND_FALSE12]] ] +// CHECK9-NEXT: store i32 [[COND14]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK9-NEXT: [[TMP48:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK9-NEXT: store i32 [[TMP48]], i32* [[DOTOMP_IV]], align 4 +// CHECK9-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP10:![0-9]+]] +// CHECK9: omp.inner.for.end: +// CHECK9-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK9: omp.loop.exit: +// CHECK9-NEXT: [[TMP49:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK9-NEXT: [[TMP50:%.*]] = load i32, i32* [[TMP49]], align 4 +// CHECK9-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* 
@[[GLOB1]], i32 [[TMP50]]) +// CHECK9-NEXT: [[TMP51:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK9-NEXT: [[TMP52:%.*]] = icmp ne i32 [[TMP51]], 0 +// CHECK9-NEXT: br i1 [[TMP52]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] +// CHECK9: .omp.final.then: +// CHECK9-NEXT: [[TMP53:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK9-NEXT: [[SUB15:%.*]] = sub nsw i32 [[TMP53]], 0 +// CHECK9-NEXT: [[DIV16:%.*]] = sdiv i32 [[SUB15]], 1 +// CHECK9-NEXT: [[MUL:%.*]] = mul nsw i32 [[DIV16]], 1 +// CHECK9-NEXT: [[ADD17:%.*]] = add nsw i32 0, [[MUL]] +// CHECK9-NEXT: store i32 [[ADD17]], i32* [[I4]], align 4 +// CHECK9-NEXT: br label [[DOTOMP_FINAL_DONE]] +// CHECK9: .omp.final.done: +// CHECK9-NEXT: [[TMP54:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK9-NEXT: [[TMP55:%.*]] = icmp ne i32 [[TMP54]], 0 +// CHECK9-NEXT: br i1 [[TMP55]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]] +// CHECK9: .omp.lastprivate.then: +// CHECK9-NEXT: [[TMP56:%.*]] = load i32, i32* [[L_ADDR]], align 4 +// CHECK9-NEXT: store i32 [[TMP56]], i32* [[L_ADDR]], align 4 +// CHECK9-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]] +// CHECK9: .omp.lastprivate.done: +// CHECK9-NEXT: br label [[OMP_PRECOND_END]] +// CHECK9: omp.precond.end: +// CHECK9-NEXT: [[TMP57:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared", align 2 +// CHECK9-NEXT: call void @__kmpc_restore_team_static_memory(i16 1, i16 [[TMP57]]) +// CHECK9-NEXT: ret void +// +// +// CHECK9-LABEL: define {{[^@]+}}@__omp_outlined__1 +// CHECK9-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32 [[L:%.*]]) #[[ATTR0]] { +// CHECK9-NEXT: entry: +// CHECK9-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK9-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK9-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4 +// CHECK9-NEXT: [[L_ADDR:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[I3:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK9-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK9-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK9-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK9-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK9-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4 +// CHECK9-NEXT: store i32 [[L]], i32* [[L_ADDR]], align 4 +// CHECK9-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4 +// CHECK9-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], 
align 4 +// CHECK9-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK9-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK9-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 +// CHECK9-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK9-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK9-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK9-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK9-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK9-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] +// CHECK9-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK9: omp.precond.then: +// CHECK9-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 +// CHECK9-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK9-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_UB]], align 4 +// CHECK9-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK9-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK9-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_LB]], align 4 +// CHECK9-NEXT: store i32 [[TMP6]], i32* [[DOTOMP_UB]], align 4 +// CHECK9-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK9-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK9-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK9-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4 +// CHECK9-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP8]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 32) +// CHECK9-NEXT: br label [[OMP_DISPATCH_COND:%.*]] +// CHECK9: omp.dispatch.cond: +// CHECK9-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK9-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK9-NEXT: [[CMP4:%.*]] = icmp ugt i32 [[TMP9]], [[TMP10]] +// CHECK9-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK9: cond.true: +// CHECK9-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK9-NEXT: br label [[COND_END:%.*]] +// CHECK9: cond.false: +// CHECK9-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK9-NEXT: br label [[COND_END]] +// CHECK9: cond.end: +// CHECK9-NEXT: [[COND:%.*]] = phi i32 [ [[TMP11]], [[COND_TRUE]] ], [ [[TMP12]], [[COND_FALSE]] ] +// CHECK9-NEXT: store i32 [[COND]], i32* [[DOTOMP_UB]], align 4 +// CHECK9-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK9-NEXT: store i32 [[TMP13]], i32* [[DOTOMP_IV]], align 4 +// CHECK9-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK9-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK9-NEXT: [[CMP5:%.*]] = icmp sle i32 [[TMP14]], [[TMP15]] +// CHECK9-NEXT: br i1 [[CMP5]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]] +// CHECK9: omp.dispatch.body: +// CHECK9-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK9: omp.inner.for.cond: +// CHECK9-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK9-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK9-NEXT: [[CMP6:%.*]] = icmp sle i32 [[TMP16]], [[TMP17]] +// CHECK9-NEXT: br i1 [[CMP6]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK9: omp.inner.for.body: +// CHECK9-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK9-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP18]], 1 
+// CHECK9-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK9-NEXT: store i32 [[ADD]], i32* [[I3]], align 4 +// CHECK9-NEXT: [[TMP19:%.*]] = load i32, i32* [[I3]], align 4 +// CHECK9-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1000 x i32], [1000 x i32]* [[TMP0]], i32 0, i32 [[TMP19]] +// CHECK9-NEXT: store i32 1, i32* [[ARRAYIDX]], align 4 +// CHECK9-NEXT: [[TMP20:%.*]] = load i32, i32* [[I3]], align 4 +// CHECK9-NEXT: store i32 [[TMP20]], i32* [[L_ADDR]], align 4 +// CHECK9-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK9: omp.body.continue: +// CHECK9-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK9: omp.inner.for.inc: +// CHECK9-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK9-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP21]], 1 +// CHECK9-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_IV]], align 4 +// CHECK9-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP12:![0-9]+]] +// CHECK9: omp.inner.for.end: +// CHECK9-NEXT: br label [[OMP_DISPATCH_INC:%.*]] +// CHECK9: omp.dispatch.inc: +// CHECK9-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK9-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK9-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP22]], [[TMP23]] +// CHECK9-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_LB]], align 4 +// CHECK9-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK9-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK9-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP24]], [[TMP25]] +// CHECK9-NEXT: store i32 [[ADD9]], i32* [[DOTOMP_UB]], align 4 +// CHECK9-NEXT: br label [[OMP_DISPATCH_COND]] +// CHECK9: omp.dispatch.end: +// CHECK9-NEXT: [[TMP26:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK9-NEXT: [[TMP27:%.*]] = load i32, i32* [[TMP26]], align 4 +// CHECK9-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP27]]) +// CHECK9-NEXT: [[TMP28:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK9-NEXT: [[TMP29:%.*]] = icmp ne i32 [[TMP28]], 0 +// CHECK9-NEXT: br i1 [[TMP29]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] +// CHECK9: .omp.final.then: +// CHECK9-NEXT: [[TMP30:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK9-NEXT: [[SUB10:%.*]] = sub nsw i32 [[TMP30]], 0 +// CHECK9-NEXT: [[DIV11:%.*]] = sdiv i32 [[SUB10]], 1 +// CHECK9-NEXT: [[MUL12:%.*]] = mul nsw i32 [[DIV11]], 1 +// CHECK9-NEXT: [[ADD13:%.*]] = add nsw i32 0, [[MUL12]] +// CHECK9-NEXT: store i32 [[ADD13]], i32* [[I3]], align 4 +// CHECK9-NEXT: br label [[DOTOMP_FINAL_DONE]] +// CHECK9: .omp.final.done: +// CHECK9-NEXT: [[TMP31:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK9-NEXT: [[TMP32:%.*]] = icmp ne i32 [[TMP31]], 0 +// CHECK9-NEXT: br i1 [[TMP32]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]] +// CHECK9: .omp.lastprivate.then: +// CHECK9-NEXT: [[TMP33:%.*]] = load i32, i32* [[L_ADDR]], align 4 +// CHECK9-NEXT: store i32 [[TMP33]], i32* [[L_ADDR]], align 4 +// CHECK9-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]] +// CHECK9: .omp.lastprivate.done: +// CHECK9-NEXT: br label [[OMP_PRECOND_END]] +// CHECK9: omp.precond.end: +// CHECK9-NEXT: ret void +// +// +// CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l44 +// CHECK9-SAME: (i32 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { +// CHECK9-NEXT: entry: +// CHECK9-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x 
i16]*, align 4 +// CHECK9-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK9-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK9-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 4 +// CHECK9-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 4 +// CHECK9-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK9-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) +// CHECK9-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK9: .execute: +// CHECK9-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) +// CHECK9-NEXT: [[TMP2:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK9-NEXT: store i32 [[TMP2]], i32* [[N_CASTED]], align 4 +// CHECK9-NEXT: [[TMP3:%.*]] = load i32, i32* [[N_CASTED]], align 4 +// CHECK9-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK9-NEXT: call void @__omp_outlined__2(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32 [[TMP3]], [1000 x i16]* [[TMP0]]) #[[ATTR2]] +// CHECK9-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK9: .omp.deinit: +// CHECK9-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) +// CHECK9-NEXT: br label [[DOTEXIT:%.*]] +// CHECK9: .exit: +// CHECK9-NEXT: ret void +// +// +// CHECK9-LABEL: define {{[^@]+}}@__omp_outlined__2 +// CHECK9-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { +// CHECK9-NEXT: entry: +// CHECK9-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK9-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK9-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 4 +// CHECK9-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[I3:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x i8*], align 4 +// CHECK9-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK9-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK9-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK9-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 4 +// CHECK9-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 4 +// CHECK9-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK9-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK9-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK9-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 +// CHECK9-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK9-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK9-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 +// 
CHECK9-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK9-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK9-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] +// CHECK9-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK9: omp.precond.then: +// CHECK9-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK9-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK9-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK9-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK9-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK9-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK9-NEXT: [[TMP5:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK9-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4 +// CHECK9-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP6]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) +// CHECK9-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK9-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK9-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP7]], [[TMP8]] +// CHECK9-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK9: cond.true: +// CHECK9-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK9-NEXT: br label [[COND_END:%.*]] +// CHECK9: cond.false: +// CHECK9-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK9-NEXT: br label [[COND_END]] +// CHECK9: cond.end: +// CHECK9-NEXT: [[COND:%.*]] = phi i32 [ [[TMP9]], [[COND_TRUE]] ], [ [[TMP10]], [[COND_FALSE]] ] +// CHECK9-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK9-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK9-NEXT: store i32 [[TMP11]], i32* [[DOTOMP_IV]], align 4 +// CHECK9-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK9: omp.inner.for.cond: +// CHECK9-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK9-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK9-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP13]], 1 +// CHECK9-NEXT: [[CMP5:%.*]] = icmp slt i32 [[TMP12]], [[ADD]] +// CHECK9-NEXT: br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK9: omp.inner.for.body: +// CHECK9-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK9-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK9-NEXT: [[TMP16:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK9-NEXT: store i32 [[TMP16]], i32* [[N_CASTED]], align 4 +// CHECK9-NEXT: [[TMP17:%.*]] = load i32, i32* [[N_CASTED]], align 4 +// CHECK9-NEXT: [[TMP18:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 +// CHECK9-NEXT: [[TMP19:%.*]] = inttoptr i32 [[TMP14]] to i8* +// CHECK9-NEXT: store i8* [[TMP19]], i8** [[TMP18]], align 4 +// CHECK9-NEXT: [[TMP20:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 +// CHECK9-NEXT: [[TMP21:%.*]] = inttoptr i32 [[TMP15]] to i8* +// CHECK9-NEXT: store i8* [[TMP21]], i8** [[TMP20]], align 4 +// CHECK9-NEXT: [[TMP22:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 +// CHECK9-NEXT: [[TMP23:%.*]] = inttoptr i32 [[TMP17]] to i8* +// CHECK9-NEXT: store 
i8* [[TMP23]], i8** [[TMP22]], align 4 +// CHECK9-NEXT: [[TMP24:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3 +// CHECK9-NEXT: [[TMP25:%.*]] = bitcast [1000 x i16]* [[TMP0]] to i8* +// CHECK9-NEXT: store i8* [[TMP25]], i8** [[TMP24]], align 4 +// CHECK9-NEXT: [[TMP26:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK9-NEXT: [[TMP27:%.*]] = load i32, i32* [[TMP26]], align 4 +// CHECK9-NEXT: [[TMP28:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK9-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP27]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, i32, [1000 x i16]*)* @__omp_outlined__3 to i8*), i8* null, i8** [[TMP28]], i32 4) +// CHECK9-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK9: omp.inner.for.inc: +// CHECK9-NEXT: [[TMP29:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK9-NEXT: [[TMP30:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK9-NEXT: [[ADD6:%.*]] = add nsw i32 [[TMP29]], [[TMP30]] +// CHECK9-NEXT: store i32 [[ADD6]], i32* [[DOTOMP_IV]], align 4 +// CHECK9-NEXT: [[TMP31:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK9-NEXT: [[TMP32:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK9-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP31]], [[TMP32]] +// CHECK9-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK9-NEXT: [[TMP33:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK9-NEXT: [[TMP34:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK9-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP33]], [[TMP34]] +// CHECK9-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK9-NEXT: [[TMP35:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK9-NEXT: [[TMP36:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK9-NEXT: [[CMP9:%.*]] = icmp sgt i32 [[TMP35]], [[TMP36]] +// CHECK9-NEXT: br i1 [[CMP9]], label [[COND_TRUE10:%.*]], label [[COND_FALSE11:%.*]] +// CHECK9: cond.true10: +// CHECK9-NEXT: [[TMP37:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK9-NEXT: br label [[COND_END12:%.*]] +// CHECK9: cond.false11: +// CHECK9-NEXT: [[TMP38:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK9-NEXT: br label [[COND_END12]] +// CHECK9: cond.end12: +// CHECK9-NEXT: [[COND13:%.*]] = phi i32 [ [[TMP37]], [[COND_TRUE10]] ], [ [[TMP38]], [[COND_FALSE11]] ] +// CHECK9-NEXT: store i32 [[COND13]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK9-NEXT: [[TMP39:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK9-NEXT: store i32 [[TMP39]], i32* [[DOTOMP_IV]], align 4 +// CHECK9-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP13:![0-9]+]] +// CHECK9: omp.inner.for.end: +// CHECK9-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK9: omp.loop.exit: +// CHECK9-NEXT: [[TMP40:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK9-NEXT: [[TMP41:%.*]] = load i32, i32* [[TMP40]], align 4 +// CHECK9-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP41]]) +// CHECK9-NEXT: [[TMP42:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK9-NEXT: [[TMP43:%.*]] = icmp ne i32 [[TMP42]], 0 +// CHECK9-NEXT: br i1 [[TMP43]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] +// CHECK9: .omp.final.then: +// CHECK9-NEXT: [[TMP44:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK9-NEXT: [[SUB14:%.*]] = sub nsw i32 [[TMP44]], 0 +// CHECK9-NEXT: [[DIV15:%.*]] = sdiv i32 [[SUB14]], 1 +// CHECK9-NEXT: [[MUL:%.*]] = mul 
nsw i32 [[DIV15]], 1 +// CHECK9-NEXT: [[ADD16:%.*]] = add nsw i32 0, [[MUL]] +// CHECK9-NEXT: store i32 [[ADD16]], i32* [[I3]], align 4 +// CHECK9-NEXT: br label [[DOTOMP_FINAL_DONE]] +// CHECK9: .omp.final.done: +// CHECK9-NEXT: br label [[OMP_PRECOND_END]] +// CHECK9: omp.precond.end: +// CHECK9-NEXT: ret void +// +// +// CHECK9-LABEL: define {{[^@]+}}@__omp_outlined__3 +// CHECK9-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { +// CHECK9-NEXT: entry: +// CHECK9-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK9-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK9-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 4 +// CHECK9-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[I3:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK9-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK9-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK9-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK9-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK9-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 4 +// CHECK9-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 4 +// CHECK9-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK9-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK9-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK9-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 +// CHECK9-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK9-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK9-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK9-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK9-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK9-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] +// CHECK9-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK9: omp.precond.then: +// CHECK9-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 +// CHECK9-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK9-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_UB]], align 4 +// CHECK9-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK9-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK9-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_LB]], align 4 +// CHECK9-NEXT: store i32 [[TMP6]], i32* [[DOTOMP_UB]], align 4 +// CHECK9-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK9-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 
+// CHECK9-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK9-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4 +// CHECK9-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP8]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) +// CHECK9-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK9-NEXT: store i32 [[TMP9]], i32* [[DOTOMP_IV]], align 4 +// CHECK9-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK9: omp.inner.for.cond: +// CHECK9-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK9-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK9-NEXT: [[CMP4:%.*]] = icmp ule i32 [[TMP10]], [[TMP11]] +// CHECK9-NEXT: br i1 [[CMP4]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK9: omp.inner.for.body: +// CHECK9-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK9-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP12]], 1 +// CHECK9-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK9-NEXT: store i32 [[ADD]], i32* [[I3]], align 4 +// CHECK9-NEXT: [[TMP13:%.*]] = load i32, i32* [[I3]], align 4 +// CHECK9-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1000 x i16], [1000 x i16]* [[TMP0]], i32 0, i32 [[TMP13]] +// CHECK9-NEXT: [[TMP14:%.*]] = load i16, i16* [[ARRAYIDX]], align 2 +// CHECK9-NEXT: [[CONV:%.*]] = sext i16 [[TMP14]] to i32 +// CHECK9-NEXT: [[ADD5:%.*]] = add nsw i32 [[CONV]], 1 +// CHECK9-NEXT: [[CONV6:%.*]] = trunc i32 [[ADD5]] to i16 +// CHECK9-NEXT: store i16 [[CONV6]], i16* [[ARRAYIDX]], align 2 +// CHECK9-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK9: omp.body.continue: +// CHECK9-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK9: omp.inner.for.inc: +// CHECK9-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK9-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK9-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP15]], [[TMP16]] +// CHECK9-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_IV]], align 4 +// CHECK9-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP14:![0-9]+]] +// CHECK9: omp.inner.for.end: +// CHECK9-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK9: omp.loop.exit: +// CHECK9-NEXT: [[TMP17:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK9-NEXT: [[TMP18:%.*]] = load i32, i32* [[TMP17]], align 4 +// CHECK9-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP18]]) +// CHECK9-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK9-NEXT: [[TMP20:%.*]] = icmp ne i32 [[TMP19]], 0 +// CHECK9-NEXT: br i1 [[TMP20]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] +// CHECK9: .omp.final.then: +// CHECK9-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK9-NEXT: [[SUB8:%.*]] = sub nsw i32 [[TMP21]], 0 +// CHECK9-NEXT: [[DIV9:%.*]] = sdiv i32 [[SUB8]], 1 +// CHECK9-NEXT: [[MUL10:%.*]] = mul nsw i32 [[DIV9]], 1 +// CHECK9-NEXT: [[ADD11:%.*]] = add nsw i32 0, [[MUL10]] +// CHECK9-NEXT: store i32 [[ADD11]], i32* [[I3]], align 4 +// CHECK9-NEXT: br label [[DOTOMP_FINAL_DONE]] +// CHECK9: .omp.final.done: +// CHECK9-NEXT: br label [[OMP_PRECOND_END]] +// CHECK9: omp.precond.end: +// CHECK9-NEXT: ret void +// +// +// CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l49 +// CHECK9-SAME: ([10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { +// CHECK9-NEXT: entry: +// 
CHECK9-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 +// CHECK9-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK9-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 +// CHECK9-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 +// CHECK9-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK9-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) +// CHECK9-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK9: .execute: +// CHECK9-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) +// CHECK9-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK9-NEXT: call void @__omp_outlined__4(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x i32]* [[TMP0]]) #[[ATTR2]] +// CHECK9-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK9: .omp.deinit: +// CHECK9-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) +// CHECK9-NEXT: br label [[DOTEXIT:%.*]] +// CHECK9: .exit: +// CHECK9-NEXT: ret void +// +// +// CHECK9-LABEL: define {{[^@]+}}@__omp_outlined__4 +// CHECK9-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { +// CHECK9-NEXT: entry: +// CHECK9-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK9-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK9-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 +// CHECK9-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [3 x i8*], align 4 +// CHECK9-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK9-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK9-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 +// CHECK9-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 +// CHECK9-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK9-NEXT: store i32 9, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK9-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK9-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK9-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK9-NEXT: [[TMP1:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK9-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4 +// CHECK9-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) +// CHECK9-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK9-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP3]], 9 +// CHECK9-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK9: cond.true: +// CHECK9-NEXT: br label [[COND_END:%.*]] +// CHECK9: cond.false: +// CHECK9-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK9-NEXT: br label 
[[COND_END]] +// CHECK9: cond.end: +// CHECK9-NEXT: [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP4]], [[COND_FALSE]] ] +// CHECK9-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK9-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK9-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 +// CHECK9-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK9: omp.inner.for.cond: +// CHECK9-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK9-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP6]], 10 +// CHECK9-NEXT: br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK9: omp.inner.for.body: +// CHECK9-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK9-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK9-NEXT: [[TMP9:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 +// CHECK9-NEXT: [[TMP10:%.*]] = inttoptr i32 [[TMP7]] to i8* +// CHECK9-NEXT: store i8* [[TMP10]], i8** [[TMP9]], align 4 +// CHECK9-NEXT: [[TMP11:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 +// CHECK9-NEXT: [[TMP12:%.*]] = inttoptr i32 [[TMP8]] to i8* +// CHECK9-NEXT: store i8* [[TMP12]], i8** [[TMP11]], align 4 +// CHECK9-NEXT: [[TMP13:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 +// CHECK9-NEXT: [[TMP14:%.*]] = bitcast [10 x i32]* [[TMP0]] to i8* +// CHECK9-NEXT: store i8* [[TMP14]], i8** [[TMP13]], align 4 +// CHECK9-NEXT: [[TMP15:%.*]] = bitcast [3 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK9-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, [10 x i32]*)* @__omp_outlined__5 to i8*), i8* null, i8** [[TMP15]], i32 3) +// CHECK9-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK9: omp.inner.for.inc: +// CHECK9-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK9-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK9-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP16]], [[TMP17]] +// CHECK9-NEXT: store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4 +// CHECK9-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK9-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK9-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP18]], [[TMP19]] +// CHECK9-NEXT: store i32 [[ADD2]], i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK9-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK9-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK9-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP20]], [[TMP21]] +// CHECK9-NEXT: store i32 [[ADD3]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK9-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK9-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP22]], 9 +// CHECK9-NEXT: br i1 [[CMP4]], label [[COND_TRUE5:%.*]], label [[COND_FALSE6:%.*]] +// CHECK9: cond.true5: +// CHECK9-NEXT: br label [[COND_END7:%.*]] +// CHECK9: cond.false6: +// CHECK9-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK9-NEXT: br label [[COND_END7]] +// CHECK9: cond.end7: +// CHECK9-NEXT: [[COND8:%.*]] = phi i32 [ 9, [[COND_TRUE5]] ], [ [[TMP23]], [[COND_FALSE6]] ] +// CHECK9-NEXT: store i32 [[COND8]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK9-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK9-NEXT: store i32 [[TMP24]], i32* [[DOTOMP_IV]], align 
4 +// CHECK9-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP15:![0-9]+]] +// CHECK9: omp.inner.for.end: +// CHECK9-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK9: omp.loop.exit: +// CHECK9-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]]) +// CHECK9-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK9-NEXT: [[TMP26:%.*]] = icmp ne i32 [[TMP25]], 0 +// CHECK9-NEXT: br i1 [[TMP26]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] +// CHECK9: .omp.final.then: +// CHECK9-NEXT: store i32 10, i32* [[I]], align 4 +// CHECK9-NEXT: br label [[DOTOMP_FINAL_DONE]] +// CHECK9: .omp.final.done: +// CHECK9-NEXT: ret void +// +// +// CHECK9-LABEL: define {{[^@]+}}@__omp_outlined__5 +// CHECK9-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { +// CHECK9-NEXT: entry: +// CHECK9-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK9-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK9-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 +// CHECK9-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK9-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK9-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK9-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK9-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 +// CHECK9-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 +// CHECK9-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 +// CHECK9-NEXT: store i32 9, i32* [[DOTOMP_UB]], align 4 +// CHECK9-NEXT: [[TMP1:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK9-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK9-NEXT: store i32 [[TMP1]], i32* [[DOTOMP_LB]], align 4 +// CHECK9-NEXT: store i32 [[TMP2]], i32* [[DOTOMP_UB]], align 4 +// CHECK9-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK9-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK9-NEXT: [[TMP3:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK9-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4 +// CHECK9-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP4]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) +// CHECK9-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK9-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 +// CHECK9-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK9: omp.inner.for.cond: +// CHECK9-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK9-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK9-NEXT: [[CMP:%.*]] = icmp ule i32 [[TMP6]], 
[[TMP7]]
+// CHECK9-NEXT: br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK9: omp.inner.for.body:
+// CHECK9-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK9-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP8]], 1
+// CHECK9-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK9-NEXT: store i32 [[ADD]], i32* [[I]], align 4
+// CHECK9-NEXT: [[TMP9:%.*]] = load i32, i32* [[I]], align 4
+// CHECK9-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP0]], i32 0, i32 [[TMP9]]
+// CHECK9-NEXT: [[TMP10:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+// CHECK9-NEXT: [[ADD1:%.*]] = add nsw i32 [[TMP10]], 1
+// CHECK9-NEXT: store i32 [[ADD1]], i32* [[ARRAYIDX]], align 4
+// CHECK9-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK9: omp.body.continue:
+// CHECK9-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK9: omp.inner.for.inc:
+// CHECK9-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK9-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK9-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP11]], [[TMP12]]
+// CHECK9-NEXT: store i32 [[ADD2]], i32* [[DOTOMP_IV]], align 4
+// CHECK9-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP16:![0-9]+]]
+// CHECK9: omp.inner.for.end:
+// CHECK9-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK9: omp.loop.exit:
+// CHECK9-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP4]])
+// CHECK9-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK9-NEXT: [[TMP14:%.*]] = icmp ne i32 [[TMP13]], 0
+// CHECK9-NEXT: br i1 [[TMP14]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]]
+// CHECK9: .omp.final.then:
+// CHECK9-NEXT: store i32 10, i32* [[I]], align 4
+// CHECK9-NEXT: br label [[DOTOMP_FINAL_DONE]]
+// CHECK9: .omp.final.done:
+// CHECK9-NEXT: ret void
+//
+//
+// CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l54
+// CHECK9-SAME: ([10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i32 [[F:%.*]]) #[[ATTR0]] {
+// CHECK9-NEXT: entry:
+// CHECK9-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4
+// CHECK9-NEXT: [[F_ADDR:%.*]] = alloca i32, align 4
+// CHECK9-NEXT: [[F_CASTED:%.*]] = alloca i32, align 4
+// CHECK9-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK9-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK9-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK9-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4
+// CHECK9-NEXT: store i32 [[F]], i32* [[F_ADDR]], align 4
+// CHECK9-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4
+// CHECK9-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK9-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0)
+// CHECK9-NEXT: br label [[DOTEXECUTE:%.*]]
+// CHECK9: .execute:
+// CHECK9-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]])
+// CHECK9-NEXT: [[TMP2:%.*]] = load i32, i32* [[F_ADDR]], align 4
+// CHECK9-NEXT: store i32 [[TMP2]], i32* [[F_CASTED]], align 4
+// CHECK9-NEXT: [[TMP3:%.*]] = load i32, i32* [[F_CASTED]], align 4
+// CHECK9-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK9-NEXT: call void @__omp_outlined__6(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x [10 x i32]]* [[TMP0]], i32 [[TMP3]]) #[[ATTR2]]
+// CHECK9-NEXT: br label [[DOTOMP_DEINIT:%.*]]
+// CHECK9: .omp.deinit:
+// CHECK9-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0)
+// CHECK9-NEXT: br label [[DOTEXIT:%.*]]
+// CHECK9: .exit:
+// CHECK9-NEXT: ret void
+//
+//
+// CHECK9-LABEL: define {{[^@]+}}@__omp_outlined__6
+// CHECK9-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i32 [[F:%.*]]) #[[ATTR0]] {
+// CHECK9-NEXT: entry:
+// CHECK9-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK9-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK9-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4
+// CHECK9-NEXT: [[F_ADDR:%.*]] = alloca i32, align 4
+// CHECK9-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK9-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK9-NEXT: [[_TMP1:%.*]] = alloca i32, align 4
+// CHECK9-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK9-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK9-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK9-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK9-NEXT: [[K:%.*]] = alloca i32, align 4
+// CHECK9-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK9-NEXT: [[J:%.*]] = alloca i32, align 4
+// CHECK9-NEXT: [[F_CASTED:%.*]] = alloca i32, align 4
+// CHECK9-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x i8*], align 4
+// CHECK9-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK9-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK9-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4
+// CHECK9-NEXT: store i32 [[F]], i32* [[F_ADDR]], align 4
+// CHECK9-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4
+// CHECK9-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK9-NEXT: store i32 99, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK9-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK9-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK9-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK9-NEXT: [[TMP1:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK9-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4
+// CHECK9-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]])
+// CHECK9-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK9-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP3]], 99
+// CHECK9-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK9: cond.true:
+// CHECK9-NEXT: br label [[COND_END:%.*]]
+// CHECK9: cond.false:
+// CHECK9-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK9-NEXT: br label [[COND_END]]
+// CHECK9: cond.end:
+// CHECK9-NEXT: [[COND:%.*]] = phi i32 [ 99, [[COND_TRUE]] ], [ [[TMP4]], [[COND_FALSE]] ]
+// CHECK9-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK9-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK9-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4
+// CHECK9-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK9: omp.inner.for.cond:
+// CHECK9-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK9-NEXT: [[CMP2:%.*]] = icmp slt i32 [[TMP6]], 100
+// CHECK9-NEXT: br i1 [[CMP2]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK9: omp.inner.for.body:
+// CHECK9-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK9-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK9-NEXT: [[TMP9:%.*]] = load i32, i32* [[F_ADDR]], align 4
+// CHECK9-NEXT: store i32 [[TMP9]], i32* [[F_CASTED]], align 4
+// CHECK9-NEXT: [[TMP10:%.*]] = load i32, i32* [[F_CASTED]], align 4
+// CHECK9-NEXT: [[TMP11:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0
+// CHECK9-NEXT: [[TMP12:%.*]] = inttoptr i32 [[TMP7]] to i8*
+// CHECK9-NEXT: store i8* [[TMP12]], i8** [[TMP11]], align 4
+// CHECK9-NEXT: [[TMP13:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1
+// CHECK9-NEXT: [[TMP14:%.*]] = inttoptr i32 [[TMP8]] to i8*
+// CHECK9-NEXT: store i8* [[TMP14]], i8** [[TMP13]], align 4
+// CHECK9-NEXT: [[TMP15:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2
+// CHECK9-NEXT: [[TMP16:%.*]] = bitcast [10 x [10 x i32]]* [[TMP0]] to i8*
+// CHECK9-NEXT: store i8* [[TMP16]], i8** [[TMP15]], align 4
+// CHECK9-NEXT: [[TMP17:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3
+// CHECK9-NEXT: [[TMP18:%.*]] = inttoptr i32 [[TMP10]] to i8*
+// CHECK9-NEXT: store i8* [[TMP18]], i8** [[TMP17]], align 4
+// CHECK9-NEXT: [[TMP19:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK9-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, [10 x [10 x i32]]*, i32)* @__omp_outlined__7 to i8*), i8* null, i8** [[TMP19]], i32 4)
+// CHECK9-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK9: omp.inner.for.inc:
+// CHECK9-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK9-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK9-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP20]], [[TMP21]]
+// CHECK9-NEXT: store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4
+// CHECK9-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK9-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK9-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP22]], [[TMP23]]
+// CHECK9-NEXT: store i32 [[ADD3]], i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK9-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK9-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK9-NEXT: [[ADD4:%.*]] = add nsw i32 [[TMP24]], [[TMP25]]
+// CHECK9-NEXT: store i32 [[ADD4]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK9-NEXT: [[TMP26:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK9-NEXT: [[CMP5:%.*]] = icmp sgt i32 [[TMP26]], 99
+// CHECK9-NEXT: br i1 [[CMP5]], label [[COND_TRUE6:%.*]], label [[COND_FALSE7:%.*]]
+// CHECK9: cond.true6:
+// CHECK9-NEXT: br label [[COND_END8:%.*]]
+// CHECK9: cond.false7:
+// CHECK9-NEXT: [[TMP27:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK9-NEXT: br label [[COND_END8]]
+// CHECK9: cond.end8:
+// CHECK9-NEXT: [[COND9:%.*]] = phi i32 [ 99, [[COND_TRUE6]] ], [ [[TMP27]], [[COND_FALSE7]] ]
+// CHECK9-NEXT: store i32 [[COND9]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK9-NEXT: [[TMP28:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK9-NEXT: store i32 [[TMP28]], i32* [[DOTOMP_IV]], align 4
+// CHECK9-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP17:![0-9]+]]
+// CHECK9: omp.inner.for.end:
+// CHECK9-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK9: omp.loop.exit:
+// CHECK9-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]])
+// CHECK9-NEXT: [[TMP29:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK9-NEXT: [[TMP30:%.*]] = icmp ne i32 [[TMP29]], 0
+// CHECK9-NEXT: br i1 [[TMP30]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]]
+// CHECK9: .omp.final.then:
+// CHECK9-NEXT: store i32 10, i32* [[I]], align 4
+// CHECK9-NEXT: store i32 10, i32* [[J]], align 4
+// CHECK9-NEXT: br label [[DOTOMP_FINAL_DONE]]
+// CHECK9: .omp.final.done:
+// CHECK9-NEXT: ret void
+//
+//
+// CHECK9-LABEL: define {{[^@]+}}@__omp_outlined__7
+// CHECK9-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i32 [[F:%.*]]) #[[ATTR0]] {
+// CHECK9-NEXT: entry:
+// CHECK9-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK9-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK9-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4
+// CHECK9-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4
+// CHECK9-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4
+// CHECK9-NEXT: [[F_ADDR:%.*]] = alloca i32, align 4
+// CHECK9-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK9-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK9-NEXT: [[_TMP1:%.*]] = alloca i32, align 4
+// CHECK9-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK9-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK9-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK9-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK9-NEXT: [[K:%.*]] = alloca i32, align 4
+// CHECK9-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK9-NEXT: [[J:%.*]] = alloca i32, align 4
+// CHECK9-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK9-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK9-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK9-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK9-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4
+// CHECK9-NEXT: store i32 [[F]], i32* [[F_ADDR]], align 4
+// CHECK9-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4
+// CHECK9-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK9-NEXT: store i32 99, i32* [[DOTOMP_UB]], align 4
+// CHECK9-NEXT: [[TMP1:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK9-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK9-NEXT: store i32 [[TMP1]], i32* [[DOTOMP_LB]], align 4
+// CHECK9-NEXT: store i32 [[TMP2]], i32* [[DOTOMP_UB]], align 4
+// CHECK9-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK9-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK9-NEXT: [[TMP3:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK9-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4
+// CHECK9-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP4]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK9-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK9-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4
+// CHECK9-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK9: omp.inner.for.cond:
+// CHECK9-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK9-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK9-NEXT: [[CMP:%.*]] = icmp ule i32 [[TMP6]], [[TMP7]]
+// CHECK9-NEXT: br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK9: omp.inner.for.body:
+// CHECK9-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK9-NEXT: [[DIV:%.*]] = sdiv i32 [[TMP8]], 10
+// CHECK9-NEXT: [[MUL:%.*]] = mul nsw i32 [[DIV]], 1
+// CHECK9-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK9-NEXT: store i32 [[ADD]], i32* [[I]], align 4
+// CHECK9-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK9-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK9-NEXT: [[DIV2:%.*]] = sdiv i32 [[TMP10]], 10
+// CHECK9-NEXT: [[MUL3:%.*]] = mul nsw i32 [[DIV2]], 10
+// CHECK9-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP9]], [[MUL3]]
+// CHECK9-NEXT: [[MUL4:%.*]] = mul nsw i32 [[SUB]], 1
+// CHECK9-NEXT: [[ADD5:%.*]] = add nsw i32 0, [[MUL4]]
+// CHECK9-NEXT: store i32 [[ADD5]], i32* [[J]], align 4
+// CHECK9-NEXT: store i32 10, i32* [[K]], align 4
+// CHECK9-NEXT: [[TMP11:%.*]] = load i32, i32* [[I]], align 4
+// CHECK9-NEXT: [[TMP12:%.*]] = load i32, i32* [[J]], align 4
+// CHECK9-NEXT: [[TMP13:%.*]] = load i32, i32* [[F_ADDR]], align 4
+// CHECK9-NEXT: [[MUL6:%.*]] = mul nsw i32 [[TMP12]], [[TMP13]]
+// CHECK9-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP11]], [[MUL6]]
+// CHECK9-NEXT: [[TMP14:%.*]] = load i32, i32* [[K]], align 4
+// CHECK9-NEXT: [[ADD8:%.*]] = add nsw i32 [[ADD7]], [[TMP14]]
+// CHECK9-NEXT: [[TMP15:%.*]] = load i32, i32* [[I]], align 4
+// CHECK9-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* [[TMP0]], i32 0, i32 [[TMP15]]
+// CHECK9-NEXT: [[TMP16:%.*]] = load i32, i32* [[J]], align 4
+// CHECK9-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[ARRAYIDX]], i32 0, i32 [[TMP16]]
+// CHECK9-NEXT: store i32 [[ADD8]], i32* [[ARRAYIDX9]], align 4
+// CHECK9-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK9: omp.body.continue:
+// CHECK9-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK9: omp.inner.for.inc:
+// CHECK9-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK9-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK9-NEXT: [[ADD10:%.*]] = add nsw i32 [[TMP17]], [[TMP18]]
+// CHECK9-NEXT: store i32 [[ADD10]], i32* [[DOTOMP_IV]], align 4
+// CHECK9-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP18:![0-9]+]]
+// CHECK9: omp.inner.for.end:
+// CHECK9-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK9: omp.loop.exit:
+// CHECK9-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP4]])
+// CHECK9-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK9-NEXT: [[TMP20:%.*]] = icmp ne i32 [[TMP19]], 0
+// CHECK9-NEXT: br i1 [[TMP20]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]]
+// CHECK9: .omp.final.then:
+// CHECK9-NEXT: store i32 10, i32* [[I]], align 4
+// CHECK9-NEXT: store i32 10, i32* [[J]], align 4
+// CHECK9-NEXT: br label [[DOTOMP_FINAL_DONE]]
+// CHECK9: .omp.final.done:
+// CHECK9-NEXT: ret void
+//
+//
+// CHECK10-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l38
+// CHECK10-SAME: (i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32 [[L:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK10-NEXT: entry:
+// CHECK10-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4
+// CHECK10-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4
+// CHECK10-NEXT: [[L_ADDR:%.*]] = alloca i32, align 4
+// CHECK10-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4
+// CHECK10-NEXT: [[L_CASTED:%.*]] = alloca i32, align 4
+// CHECK10-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK10-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK10-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK10-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4
+// CHECK10-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4
+// CHECK10-NEXT: store i32 [[L]], i32* [[L_ADDR]], align 4
+// CHECK10-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4
+// CHECK10-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK10-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0)
+// CHECK10-NEXT: br label [[DOTEXECUTE:%.*]]
+// CHECK10: .execute:
+// CHECK10-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3:[0-9]+]])
+// CHECK10-NEXT: [[TMP2:%.*]] = load i32, i32* [[N_ADDR]], align 4
+// CHECK10-NEXT: store i32 [[TMP2]], i32* [[N_CASTED]], align 4
+// CHECK10-NEXT: [[TMP3:%.*]] = load i32, i32* [[N_CASTED]], align 4
+// CHECK10-NEXT: [[TMP4:%.*]] = load i32, i32* [[L_ADDR]], align 4
+// CHECK10-NEXT: store i32 [[TMP4]], i32* [[L_CASTED]], align 4
+// CHECK10-NEXT: [[TMP5:%.*]] = load i32, i32* [[L_CASTED]], align 4
+// CHECK10-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK10-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32 [[TMP3]], [1000 x i32]* [[TMP0]], i32 [[TMP5]]) #[[ATTR2:[0-9]+]]
+// CHECK10-NEXT: br label [[DOTOMP_DEINIT:%.*]]
+// CHECK10: .omp.deinit:
+// CHECK10-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0)
+// CHECK10-NEXT: br label [[DOTEXIT:%.*]]
+// CHECK10: .exit:
+// CHECK10-NEXT: ret void
+//
+//
+// CHECK10-LABEL: define {{[^@]+}}@__omp_outlined__
+// CHECK10-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32 [[L:%.*]]) #[[ATTR0]] {
+// CHECK10-NEXT: entry:
+// CHECK10-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK10-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK10-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4
+// CHECK10-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4
+// CHECK10-NEXT: [[L_ADDR:%.*]] = alloca i32, align 4
+// CHECK10-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK10-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK10-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK10-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4
+// CHECK10-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK10-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK10-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK10-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK10-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK10-NEXT: [[I4:%.*]] = alloca i32, align 4
+// CHECK10-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4
+// CHECK10-NEXT: [[L_CASTED:%.*]] = alloca i32, align 4
+// CHECK10-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [5 x i8*], align 4
+// CHECK10-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK10-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK10-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4
+// CHECK10-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4
+// CHECK10-NEXT: store i32 [[L]], i32* [[L_ADDR]], align 4
+// CHECK10-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4
+// CHECK10-NEXT: [[TMP1:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared", align 2
+// CHECK10-NEXT: [[TMP2:%.*]] = load i32, i32* @"_openmp_static_kernel$size", align 4
+// CHECK10-NEXT: call void @__kmpc_get_team_static_memory(i16 1, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds (%"union._shared_openmp_static_memory_type_$_", %"union._shared_openmp_static_memory_type_$_" addrspace(3)* @"_openmp_shared_static_glob_rd_$_", i32 0, i32 0, i32 0) to i8*), i32 [[TMP2]], i16 [[TMP1]], i8** addrspacecast (i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr" to i8**))
+// CHECK10-NEXT: [[TMP3:%.*]] = load i8*, i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr", align 4
+// CHECK10-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, i8* [[TMP3]], i32 0
+// CHECK10-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to %struct._globalized_locals_ty*
+// CHECK10-NEXT: [[L1:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP5]], i32 0, i32 0
+// CHECK10-NEXT: [[TMP6:%.*]] = load i32, i32* [[N_ADDR]], align 4
+// CHECK10-NEXT: store i32 [[TMP6]], i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK10-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK10-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP7]], 0
+// CHECK10-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK10-NEXT: [[SUB3:%.*]] = sub nsw i32 [[DIV]], 1
+// CHECK10-NEXT: store i32 [[SUB3]], i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK10-NEXT: store i32 0, i32* [[I]], align 4
+// CHECK10-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK10-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP8]]
+// CHECK10-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]]
+// CHECK10: omp.precond.then:
+// CHECK10-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK10-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK10-NEXT: store i32 [[TMP9]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK10-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK10-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK10-NEXT: [[TMP10:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK10-NEXT: [[TMP11:%.*]] = load i32, i32* [[TMP10]], align 4
+// CHECK10-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1:[0-9]+]], i32 [[TMP11]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 128)
+// CHECK10-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK10-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK10-NEXT: [[CMP5:%.*]] = icmp sgt i32 [[TMP12]], [[TMP13]]
+// CHECK10-NEXT: br i1 [[CMP5]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK10: cond.true:
+// CHECK10-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK10-NEXT: br label [[COND_END:%.*]]
+// CHECK10: cond.false:
+// CHECK10-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK10-NEXT: br label [[COND_END]]
+// CHECK10: cond.end:
+// CHECK10-NEXT: [[COND:%.*]] = phi i32 [ [[TMP14]], [[COND_TRUE]] ], [ [[TMP15]], [[COND_FALSE]] ]
+// CHECK10-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK10-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK10-NEXT: store i32 [[TMP16]], i32* [[DOTOMP_IV]], align 4
+// CHECK10-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK10: omp.inner.for.cond:
+// CHECK10-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK10-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK10-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP18]], 1
+// CHECK10-NEXT: [[CMP6:%.*]] = icmp slt i32 [[TMP17]], [[ADD]]
+// CHECK10-NEXT: br i1 [[CMP6]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK10: omp.inner.for.body:
+// CHECK10-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK10-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK10-NEXT: [[TMP21:%.*]] = load i32, i32* [[N_ADDR]], align 4
+// CHECK10-NEXT: store i32 [[TMP21]], i32* [[N_CASTED]], align 4
+// CHECK10-NEXT: [[TMP22:%.*]] = load i32, i32* [[N_CASTED]], align 4
+// CHECK10-NEXT: [[TMP23:%.*]] = load i32, i32* [[L_ADDR]], align 4
+// CHECK10-NEXT: store i32 [[TMP23]], i32* [[L_CASTED]], align 4
+// CHECK10-NEXT: [[TMP24:%.*]] = load i32, i32* [[L_CASTED]], align 4
+// CHECK10-NEXT: [[TMP25:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0
+// CHECK10-NEXT: [[TMP26:%.*]] = inttoptr i32 [[TMP19]] to i8*
+// CHECK10-NEXT: store i8* [[TMP26]], i8** [[TMP25]], align 4
+// CHECK10-NEXT: [[TMP27:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1
+// CHECK10-NEXT: [[TMP28:%.*]] = inttoptr i32 [[TMP20]] to i8*
+// CHECK10-NEXT: store i8* [[TMP28]], i8** [[TMP27]], align 4
+// CHECK10-NEXT: [[TMP29:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2
+// CHECK10-NEXT: [[TMP30:%.*]] = inttoptr i32 [[TMP22]] to i8*
+// CHECK10-NEXT: store i8* [[TMP30]], i8** [[TMP29]], align 4
+// CHECK10-NEXT: [[TMP31:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3
+// CHECK10-NEXT: [[TMP32:%.*]] = bitcast [1000 x i32]* [[TMP0]] to i8*
+// CHECK10-NEXT: store i8* [[TMP32]], i8** [[TMP31]], align 4
+// CHECK10-NEXT: [[TMP33:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 4
+// CHECK10-NEXT: [[TMP34:%.*]] = inttoptr i32 [[TMP24]] to i8*
+// CHECK10-NEXT: store i8* [[TMP34]], i8** [[TMP33]], align 4
+// CHECK10-NEXT: [[TMP35:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK10-NEXT: [[TMP36:%.*]] = load i32, i32* [[TMP35]], align 4
+// CHECK10-NEXT: [[TMP37:%.*]] = bitcast [5 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK10-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP36]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, i32, [1000 x i32]*, i32)* @__omp_outlined__1 to i8*), i8* null, i8** [[TMP37]], i32 5)
+// CHECK10-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK10: omp.inner.for.inc:
+// CHECK10-NEXT: [[TMP38:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK10-NEXT: [[TMP39:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK10-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP38]], [[TMP39]]
+// CHECK10-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_IV]], align 4
+// CHECK10-NEXT: [[TMP40:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK10-NEXT: [[TMP41:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK10-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP40]], [[TMP41]]
+// CHECK10-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK10-NEXT: [[TMP42:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK10-NEXT: [[TMP43:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK10-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP42]], [[TMP43]]
+// CHECK10-NEXT: store i32 [[ADD9]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK10-NEXT: [[TMP44:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK10-NEXT: [[TMP45:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK10-NEXT: [[CMP10:%.*]] = icmp sgt i32 [[TMP44]], [[TMP45]]
+// CHECK10-NEXT: br i1 [[CMP10]], label [[COND_TRUE11:%.*]], label [[COND_FALSE12:%.*]]
+// CHECK10: cond.true11:
+// CHECK10-NEXT: [[TMP46:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK10-NEXT: br label [[COND_END13:%.*]]
+// CHECK10: cond.false12:
+// CHECK10-NEXT: [[TMP47:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK10-NEXT: br label [[COND_END13]]
+// CHECK10: cond.end13:
+// CHECK10-NEXT: [[COND14:%.*]] = phi i32 [ [[TMP46]], [[COND_TRUE11]] ], [ [[TMP47]], [[COND_FALSE12]] ]
+// CHECK10-NEXT: store i32 [[COND14]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK10-NEXT: [[TMP48:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK10-NEXT: store i32 [[TMP48]], i32* [[DOTOMP_IV]], align 4
+// CHECK10-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP10:![0-9]+]]
+// CHECK10: omp.inner.for.end:
+// CHECK10-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK10: omp.loop.exit:
+// CHECK10-NEXT: [[TMP49:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK10-NEXT: [[TMP50:%.*]] = load i32, i32* [[TMP49]], align 4
+// CHECK10-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP50]])
+// CHECK10-NEXT: [[TMP51:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK10-NEXT: [[TMP52:%.*]] = icmp ne i32 [[TMP51]], 0
+// CHECK10-NEXT: br i1 [[TMP52]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]]
+// CHECK10: .omp.final.then:
+// CHECK10-NEXT: [[TMP53:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK10-NEXT: [[SUB15:%.*]] = sub nsw i32 [[TMP53]], 0
+// CHECK10-NEXT: [[DIV16:%.*]] = sdiv i32 [[SUB15]], 1
+// CHECK10-NEXT: [[MUL:%.*]] = mul nsw i32 [[DIV16]], 1
+// CHECK10-NEXT: [[ADD17:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK10-NEXT: store i32 [[ADD17]], i32* [[I4]], align 4
+// CHECK10-NEXT: br label [[DOTOMP_FINAL_DONE]]
+// CHECK10: .omp.final.done:
+// CHECK10-NEXT: [[TMP54:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK10-NEXT: [[TMP55:%.*]] = icmp ne i32 [[TMP54]], 0
+// CHECK10-NEXT: br i1 [[TMP55]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]]
+// CHECK10: .omp.lastprivate.then:
+// CHECK10-NEXT: [[TMP56:%.*]] = load i32, i32* [[L_ADDR]], align 4
+// CHECK10-NEXT: store i32 [[TMP56]], i32* [[L_ADDR]], align 4
+// CHECK10-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]]
+// CHECK10: .omp.lastprivate.done:
+// CHECK10-NEXT: br label [[OMP_PRECOND_END]]
+// CHECK10: omp.precond.end:
+// CHECK10-NEXT: [[TMP57:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared", align 2
+// CHECK10-NEXT: call void @__kmpc_restore_team_static_memory(i16 1, i16 [[TMP57]])
+// CHECK10-NEXT: ret void
+//
+//
+// CHECK10-LABEL: define {{[^@]+}}@__omp_outlined__1
+// CHECK10-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32 [[L:%.*]]) #[[ATTR0]] {
+// CHECK10-NEXT: entry:
+// CHECK10-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK10-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK10-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4
+// CHECK10-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4
+// CHECK10-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4
+// CHECK10-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4
+// CHECK10-NEXT: [[L_ADDR:%.*]] = alloca i32, align 4
+// CHECK10-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK10-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK10-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK10-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
+// CHECK10-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK10-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK10-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK10-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK10-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK10-NEXT: [[I3:%.*]] = alloca i32, align 4
+// CHECK10-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK10-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK10-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK10-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK10-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4
+// CHECK10-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4
+// CHECK10-NEXT: store i32 [[L]], i32* [[L_ADDR]], align 4
+// CHECK10-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4
+// CHECK10-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4
+// CHECK10-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK10-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK10-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0
+// CHECK10-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK10-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1
+// CHECK10-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK10-NEXT: store i32 0, i32* [[I]], align 4
+// CHECK10-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK10-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]]
+// CHECK10-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]]
+// CHECK10: omp.precond.then:
+// CHECK10-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK10-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK10-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_UB]], align 4
+// CHECK10-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK10-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK10-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_LB]], align 4
+// CHECK10-NEXT: store i32 [[TMP6]], i32* [[DOTOMP_UB]], align 4
+// CHECK10-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK10-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK10-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK10-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4
+// CHECK10-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP8]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 32)
+// CHECK10-NEXT: br label [[OMP_DISPATCH_COND:%.*]]
+// CHECK10: omp.dispatch.cond:
+// CHECK10-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK10-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK10-NEXT: [[CMP4:%.*]] = icmp ugt i32 [[TMP9]], [[TMP10]]
+// CHECK10-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK10: cond.true:
+// CHECK10-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK10-NEXT: br label [[COND_END:%.*]]
+// CHECK10: cond.false:
+// CHECK10-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK10-NEXT: br label [[COND_END]]
+// CHECK10: cond.end:
+// CHECK10-NEXT: [[COND:%.*]] = phi i32 [ [[TMP11]], [[COND_TRUE]] ], [ [[TMP12]], [[COND_FALSE]] ]
+// CHECK10-NEXT: store i32 [[COND]], i32* [[DOTOMP_UB]], align 4
+// CHECK10-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK10-NEXT: store i32 [[TMP13]], i32* [[DOTOMP_IV]], align 4
+// CHECK10-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK10-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK10-NEXT: [[CMP5:%.*]] = icmp sle i32 [[TMP14]], [[TMP15]]
+// CHECK10-NEXT: br i1 [[CMP5]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]]
+// CHECK10: omp.dispatch.body:
+// CHECK10-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK10: omp.inner.for.cond:
+// CHECK10-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK10-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK10-NEXT: [[CMP6:%.*]] = icmp sle i32 [[TMP16]], [[TMP17]]
+// CHECK10-NEXT: br i1 [[CMP6]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK10: omp.inner.for.body:
+// CHECK10-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK10-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP18]], 1
+// CHECK10-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK10-NEXT: store i32 [[ADD]], i32* [[I3]], align 4
+// CHECK10-NEXT: [[TMP19:%.*]] = load i32, i32* [[I3]], align 4
+// CHECK10-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1000 x i32], [1000 x i32]* [[TMP0]], i32 0, i32 [[TMP19]]
+// CHECK10-NEXT: store i32 1, i32* [[ARRAYIDX]], align 4
+// CHECK10-NEXT: [[TMP20:%.*]] = load i32, i32* [[I3]], align 4
+// CHECK10-NEXT: store i32 [[TMP20]], i32* [[L_ADDR]], align 4
+// CHECK10-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK10: omp.body.continue:
+// CHECK10-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK10: omp.inner.for.inc:
+// CHECK10-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK10-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP21]], 1
+// CHECK10-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_IV]], align 4
+// CHECK10-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP12:![0-9]+]]
+// CHECK10: omp.inner.for.end:
+// CHECK10-NEXT: br label [[OMP_DISPATCH_INC:%.*]]
+// CHECK10: omp.dispatch.inc:
+// CHECK10-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK10-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK10-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP22]], [[TMP23]]
+// CHECK10-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_LB]], align 4
+// CHECK10-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK10-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK10-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP24]], [[TMP25]]
+// CHECK10-NEXT: store i32 [[ADD9]], i32* [[DOTOMP_UB]], align 4
+// CHECK10-NEXT: br label [[OMP_DISPATCH_COND]]
+// CHECK10: omp.dispatch.end:
+// CHECK10-NEXT: [[TMP26:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK10-NEXT: [[TMP27:%.*]] = load i32, i32* [[TMP26]], align 4
+// CHECK10-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP27]])
+// CHECK10-NEXT: [[TMP28:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK10-NEXT: [[TMP29:%.*]] = icmp ne i32 [[TMP28]], 0
+// CHECK10-NEXT: br i1 [[TMP29]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]]
+// CHECK10: .omp.final.then:
+// CHECK10-NEXT: [[TMP30:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK10-NEXT: [[SUB10:%.*]] = sub nsw i32 [[TMP30]], 0
+// CHECK10-NEXT: [[DIV11:%.*]] = sdiv i32 [[SUB10]], 1
+// CHECK10-NEXT: [[MUL12:%.*]] = mul nsw i32 [[DIV11]], 1
+// CHECK10-NEXT: [[ADD13:%.*]] = add nsw i32 0, [[MUL12]]
+// CHECK10-NEXT: store i32 [[ADD13]], i32* [[I3]], align 4
+// CHECK10-NEXT: br label [[DOTOMP_FINAL_DONE]]
+// CHECK10: .omp.final.done:
+// CHECK10-NEXT: [[TMP31:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK10-NEXT: [[TMP32:%.*]] = icmp ne i32 [[TMP31]], 0
+// CHECK10-NEXT: br i1 [[TMP32]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]]
+// CHECK10: .omp.lastprivate.then:
+// CHECK10-NEXT: [[TMP33:%.*]] = load i32, i32* [[L_ADDR]], align 4
+// CHECK10-NEXT: store i32 [[TMP33]], i32* [[L_ADDR]], align 4
+// CHECK10-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]]
+// CHECK10: .omp.lastprivate.done:
+// CHECK10-NEXT: br label [[OMP_PRECOND_END]]
+// CHECK10: omp.precond.end:
+// CHECK10-NEXT: ret void
+//
+//
+// CHECK10-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l44
+// CHECK10-SAME: (i32 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] {
+// CHECK10-NEXT: entry:
+// CHECK10-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4
+// CHECK10-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 4
+// CHECK10-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4
+// CHECK10-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK10-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK10-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK10-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4
+// CHECK10-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 4
+// CHECK10-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 4
+// CHECK10-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK10-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0)
+// CHECK10-NEXT: br label [[DOTEXECUTE:%.*]]
+// CHECK10: .execute:
+// CHECK10-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]])
+// CHECK10-NEXT: [[TMP2:%.*]] = load i32, i32* [[N_ADDR]], align 4
+// CHECK10-NEXT: store i32 [[TMP2]], i32* [[N_CASTED]], align 4
+// CHECK10-NEXT: [[TMP3:%.*]] = load i32, i32* [[N_CASTED]], align 4
+// CHECK10-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK10-NEXT: call void @__omp_outlined__2(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32 [[TMP3]], [1000 x i16]* [[TMP0]]) #[[ATTR2]]
+// CHECK10-NEXT: br label [[DOTOMP_DEINIT:%.*]]
+// CHECK10: .omp.deinit:
+// CHECK10-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0)
+// CHECK10-NEXT: br label [[DOTEXIT:%.*]]
+// CHECK10: .exit:
+// CHECK10-NEXT: ret void
+//
+//
+// CHECK10-LABEL: define {{[^@]+}}@__omp_outlined__2
+// CHECK10-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] {
+// CHECK10-NEXT: entry:
+// CHECK10-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK10-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK10-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4
+// CHECK10-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 4
+// CHECK10-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK10-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK10-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK10-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
+// CHECK10-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK10-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK10-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK10-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK10-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK10-NEXT: [[I3:%.*]] = alloca i32, align 4
+// CHECK10-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4
+// CHECK10-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x i8*], align 4
+// CHECK10-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK10-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK10-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4
+// CHECK10-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 4
+// CHECK10-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 4
+// CHECK10-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4
+// CHECK10-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK10-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK10-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0
+// CHECK10-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK10-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1
+// CHECK10-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK10-NEXT: store i32 0, i32* [[I]], align 4
+// CHECK10-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK10-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]]
+// CHECK10-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]]
+// CHECK10: omp.precond.then:
+// CHECK10-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK10-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK10-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK10-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK10-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK10-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK10-NEXT: [[TMP5:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK10-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4
+// CHECK10-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP6]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]])
+// CHECK10-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK10-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK10-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP7]], [[TMP8]]
+// CHECK10-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK10: cond.true:
+// CHECK10-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK10-NEXT: br label [[COND_END:%.*]]
+// CHECK10: cond.false:
+// CHECK10-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK10-NEXT: br label [[COND_END]]
+// CHECK10: cond.end:
+// CHECK10-NEXT: [[COND:%.*]] = phi i32 [ [[TMP9]], [[COND_TRUE]] ], [ [[TMP10]], [[COND_FALSE]] ]
+// CHECK10-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK10-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK10-NEXT: store i32 [[TMP11]], i32* [[DOTOMP_IV]], align 4
+// CHECK10-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK10: omp.inner.for.cond:
+// CHECK10-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK10-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK10-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP13]], 1
+// CHECK10-NEXT: [[CMP5:%.*]] = icmp slt i32 [[TMP12]], [[ADD]]
+// CHECK10-NEXT: br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK10: omp.inner.for.body:
+// CHECK10-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK10-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK10-NEXT: [[TMP16:%.*]] = load i32, i32* [[N_ADDR]], align 4
+// CHECK10-NEXT: store i32 [[TMP16]], i32* [[N_CASTED]], align 4
+// CHECK10-NEXT: [[TMP17:%.*]] = load i32, i32* [[N_CASTED]], align 4
+// CHECK10-NEXT: [[TMP18:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0
+// CHECK10-NEXT: [[TMP19:%.*]] = inttoptr i32 [[TMP14]] to i8*
+// CHECK10-NEXT: store i8* [[TMP19]], i8** [[TMP18]], align 4
+// CHECK10-NEXT: [[TMP20:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1
+// CHECK10-NEXT: [[TMP21:%.*]] = inttoptr i32 [[TMP15]] to i8*
+// CHECK10-NEXT: store i8* [[TMP21]], i8** [[TMP20]], align 4
+// CHECK10-NEXT: [[TMP22:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2
+// CHECK10-NEXT: [[TMP23:%.*]] = inttoptr i32 [[TMP17]] to i8*
+// CHECK10-NEXT: store i8* [[TMP23]], i8** [[TMP22]], align 4
+// CHECK10-NEXT: [[TMP24:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3
+// CHECK10-NEXT: [[TMP25:%.*]] = bitcast [1000 x i16]* [[TMP0]] to i8*
+// CHECK10-NEXT: store i8* [[TMP25]], i8** [[TMP24]], align 4
+// CHECK10-NEXT: [[TMP26:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK10-NEXT: [[TMP27:%.*]] = load i32, i32* [[TMP26]], align 4
+// CHECK10-NEXT: [[TMP28:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK10-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP27]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, i32, [1000 x i16]*)* @__omp_outlined__3 to i8*), i8* null, i8** [[TMP28]], i32 4)
+// CHECK10-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK10: omp.inner.for.inc:
+// CHECK10-NEXT: [[TMP29:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK10-NEXT: [[TMP30:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK10-NEXT: [[ADD6:%.*]] = add nsw i32 [[TMP29]], [[TMP30]]
+// CHECK10-NEXT: store i32 [[ADD6]], i32* [[DOTOMP_IV]], align 4
+// CHECK10-NEXT: [[TMP31:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK10-NEXT: [[TMP32:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK10-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP31]], [[TMP32]]
+// CHECK10-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK10-NEXT: [[TMP33:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK10-NEXT: [[TMP34:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK10-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP33]], [[TMP34]]
+// CHECK10-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK10-NEXT: [[TMP35:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK10-NEXT: [[TMP36:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK10-NEXT: [[CMP9:%.*]] = icmp sgt i32 [[TMP35]], [[TMP36]]
+// CHECK10-NEXT: br i1 [[CMP9]], label [[COND_TRUE10:%.*]], label [[COND_FALSE11:%.*]]
+// CHECK10: cond.true10:
+// CHECK10-NEXT: [[TMP37:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK10-NEXT: br label [[COND_END12:%.*]]
+// CHECK10: cond.false11:
+// CHECK10-NEXT: [[TMP38:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK10-NEXT: br label [[COND_END12]]
+// CHECK10: cond.end12:
+// CHECK10-NEXT: [[COND13:%.*]] = phi i32 [ [[TMP37]], [[COND_TRUE10]] ], [ [[TMP38]], [[COND_FALSE11]] ]
+// CHECK10-NEXT: store i32 [[COND13]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK10-NEXT: [[TMP39:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK10-NEXT: store i32 [[TMP39]], i32* [[DOTOMP_IV]], align 4
+// CHECK10-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP13:![0-9]+]]
+// CHECK10: omp.inner.for.end:
+// CHECK10-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK10: omp.loop.exit:
+// CHECK10-NEXT: [[TMP40:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK10-NEXT: [[TMP41:%.*]] = load i32, i32* [[TMP40]], align 4
+// CHECK10-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP41]])
+// CHECK10-NEXT: [[TMP42:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK10-NEXT: [[TMP43:%.*]] = icmp ne i32 [[TMP42]], 0
+// CHECK10-NEXT: br i1 [[TMP43]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]]
+// CHECK10: .omp.final.then:
+// CHECK10-NEXT: [[TMP44:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK10-NEXT: [[SUB14:%.*]] = sub nsw i32 [[TMP44]], 0
+// CHECK10-NEXT: [[DIV15:%.*]] = sdiv i32 [[SUB14]], 1
+// CHECK10-NEXT: [[MUL:%.*]] = mul nsw i32 [[DIV15]], 1
+// CHECK10-NEXT: [[ADD16:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK10-NEXT: store i32 [[ADD16]], i32* [[I3]], align 4
+// CHECK10-NEXT: br label [[DOTOMP_FINAL_DONE]]
+// CHECK10: .omp.final.done:
+// CHECK10-NEXT: br label [[OMP_PRECOND_END]]
+// CHECK10: omp.precond.end:
+// CHECK10-NEXT: ret void
+//
+//
+// CHECK10-LABEL: define {{[^@]+}}@__omp_outlined__3
+// CHECK10-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] {
+// CHECK10-NEXT: entry:
+// CHECK10-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK10-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK10-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4
+// CHECK10-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4
+// CHECK10-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4
+// CHECK10-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 4
+// CHECK10-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK10-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK10-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK10-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
+// CHECK10-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK10-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK10-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK10-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK10-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK10-NEXT: [[I3:%.*]] = alloca i32, align 4
+// CHECK10-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK10-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK10-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK10-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK10-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4
+// CHECK10-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 4
+// CHECK10-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 4
+// CHECK10-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4
+// CHECK10-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK10-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK10-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0
+// CHECK10-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK10-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1
+// CHECK10-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK10-NEXT: store i32 0, i32* [[I]], align 4
+// CHECK10-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK10-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]]
+// CHECK10-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]]
+// CHECK10: omp.precond.then:
+// CHECK10-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK10-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK10-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_UB]], align 4
+// CHECK10-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK10-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK10-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_LB]], align 4
+// CHECK10-NEXT: store i32 [[TMP6]], i32* [[DOTOMP_UB]], align 4
+// CHECK10-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK10-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK10-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK10-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4
+// CHECK10-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP8]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK10-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK10-NEXT: store i32 [[TMP9]], i32* [[DOTOMP_IV]], align 4
+// CHECK10-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK10: omp.inner.for.cond:
+// CHECK10-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK10-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK10-NEXT: [[CMP4:%.*]] = icmp ule i32 [[TMP10]], [[TMP11]]
+// CHECK10-NEXT: br i1 [[CMP4]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK10: omp.inner.for.body:
+// CHECK10-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK10-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP12]], 1
+// CHECK10-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK10-NEXT: store i32 [[ADD]], i32* [[I3]], align 4
+// CHECK10-NEXT: [[TMP13:%.*]] = load i32, i32* [[I3]], align 4
+// CHECK10-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1000 x i16], [1000 x i16]* [[TMP0]], i32 0, i32 [[TMP13]]
+// CHECK10-NEXT: [[TMP14:%.*]] = load i16, i16* [[ARRAYIDX]], align 2
+// CHECK10-NEXT: [[CONV:%.*]] = sext i16 [[TMP14]] to i32
+// CHECK10-NEXT: [[ADD5:%.*]] = add nsw i32 [[CONV]], 1
+// CHECK10-NEXT: [[CONV6:%.*]] = trunc i32 [[ADD5]] to i16
+// CHECK10-NEXT: store i16 [[CONV6]], i16* [[ARRAYIDX]], align 2
+// CHECK10-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK10: omp.body.continue:
+// CHECK10-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK10: omp.inner.for.inc:
+// CHECK10-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK10-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK10-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP15]], [[TMP16]]
+// CHECK10-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_IV]], align 4
+// CHECK10-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP14:![0-9]+]]
+// CHECK10: omp.inner.for.end:
+// CHECK10-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK10: omp.loop.exit:
+// CHECK10-NEXT: [[TMP17:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK10-NEXT: [[TMP18:%.*]] = load i32, i32* [[TMP17]], align 4
+// CHECK10-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP18]])
+// CHECK10-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK10-NEXT: [[TMP20:%.*]] = icmp ne i32 [[TMP19]], 0
+// CHECK10-NEXT: br i1 [[TMP20]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]]
+// CHECK10: .omp.final.then:
+// CHECK10-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK10-NEXT: [[SUB8:%.*]] = sub nsw i32 [[TMP21]], 0
+// CHECK10-NEXT: [[DIV9:%.*]] = sdiv i32 [[SUB8]], 1
+// CHECK10-NEXT: [[MUL10:%.*]] = mul nsw i32 [[DIV9]], 1
+// CHECK10-NEXT: [[ADD11:%.*]] = add nsw i32 0, [[MUL10]]
+// CHECK10-NEXT: store i32 [[ADD11]], i32* [[I3]], align 4
+// CHECK10-NEXT: br label [[DOTOMP_FINAL_DONE]]
+// CHECK10: .omp.final.done:
+// CHECK10-NEXT: br label [[OMP_PRECOND_END]]
+// CHECK10: omp.precond.end:
+// CHECK10-NEXT: ret void
+//
+//
+// CHECK10-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l49
+// CHECK10-SAME: ([10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] {
+// CHECK10-NEXT: entry:
+// CHECK10-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4
+// CHECK10-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK10-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK10-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK10-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4
+// CHECK10-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4
+// CHECK10-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK10-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0)
+// CHECK10-NEXT: br label [[DOTEXECUTE:%.*]]
+// CHECK10: .execute:
+// CHECK10-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]])
+// CHECK10-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK10-NEXT: call void @__omp_outlined__4(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x i32]* [[TMP0]]) #[[ATTR2]]
+// CHECK10-NEXT: br label [[DOTOMP_DEINIT:%.*]]
+// CHECK10: .omp.deinit:
+// CHECK10-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0)
+// CHECK10-NEXT: br label [[DOTEXIT:%.*]]
+// CHECK10: .exit:
+// CHECK10-NEXT: ret void
+//
+//
+// CHECK10-LABEL: define {{[^@]+}}@__omp_outlined__4
+// CHECK10-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] {
+// CHECK10-NEXT: entry:
+// CHECK10-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK10-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK10-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4
+// CHECK10-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK10-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK10-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK10-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK10-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK10-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK10-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK10-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [3 x i8*], align 4
+// CHECK10-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK10-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK10-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4
+// CHECK10-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4
+// CHECK10-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK10-NEXT: store i32 9, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK10-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK10-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK10-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK10-NEXT: [[TMP1:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK10-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4
+// CHECK10-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]])
+// CHECK10-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK10-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP3]], 9
+// CHECK10-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK10: cond.true:
+// CHECK10-NEXT: br label [[COND_END:%.*]]
+// CHECK10: cond.false:
+// CHECK10-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK10-NEXT: br label [[COND_END]]
+// CHECK10: cond.end:
+// CHECK10-NEXT: [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP4]], [[COND_FALSE]] ]
+// CHECK10-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK10-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK10-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4
+// CHECK10-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK10: omp.inner.for.cond:
+// CHECK10-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK10-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP6]], 10
+// CHECK10-NEXT: br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK10: omp.inner.for.body:
+// CHECK10-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK10-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK10-NEXT: [[TMP9:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0
+// CHECK10-NEXT: [[TMP10:%.*]] = inttoptr i32 [[TMP7]] to i8*
+// CHECK10-NEXT: store i8* [[TMP10]], i8** [[TMP9]], align 4
+// CHECK10-NEXT: [[TMP11:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1
+// CHECK10-NEXT: [[TMP12:%.*]] = inttoptr i32 [[TMP8]] to i8*
+// CHECK10-NEXT: store i8* [[TMP12]], i8** [[TMP11]], align 4
+// CHECK10-NEXT: [[TMP13:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2
+// CHECK10-NEXT: [[TMP14:%.*]] = bitcast [10 x i32]* [[TMP0]] to i8*
+// CHECK10-NEXT: store i8* [[TMP14]], i8** [[TMP13]], align 4
+// CHECK10-NEXT: [[TMP15:%.*]] = bitcast [3 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK10-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, [10 x i32]*)* @__omp_outlined__5 to i8*), i8* null, i8** [[TMP15]], i32 3)
+// CHECK10-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK10: omp.inner.for.inc:
+// CHECK10-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK10-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK10-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP16]], [[TMP17]]
+// CHECK10-NEXT: store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4
+// CHECK10-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK10-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK10-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP18]], [[TMP19]]
+// CHECK10-NEXT: store i32 [[ADD2]], i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK10-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK10-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK10-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP20]], [[TMP21]]
+// CHECK10-NEXT: store i32 [[ADD3]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK10-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK10-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP22]], 9
+// CHECK10-NEXT: br i1 [[CMP4]], label [[COND_TRUE5:%.*]], label [[COND_FALSE6:%.*]]
+// CHECK10: cond.true5:
+// CHECK10-NEXT: br label [[COND_END7:%.*]]
+// CHECK10: cond.false6:
+// CHECK10-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK10-NEXT: br label [[COND_END7]]
+// CHECK10: cond.end7:
+// CHECK10-NEXT: [[COND8:%.*]] = phi i32 [ 9, [[COND_TRUE5]] ], [ [[TMP23]], [[COND_FALSE6]] ]
+// CHECK10-NEXT: store i32 [[COND8]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK10-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK10-NEXT: store i32 [[TMP24]], i32* [[DOTOMP_IV]], align 4
+// CHECK10-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP15:![0-9]+]]
+// CHECK10: omp.inner.for.end:
+// CHECK10-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK10: omp.loop.exit:
+// CHECK10-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]])
+// CHECK10-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK10-NEXT: [[TMP26:%.*]] = icmp ne i32 [[TMP25]], 0
+// CHECK10-NEXT: br i1 [[TMP26]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]]
+// CHECK10: .omp.final.then:
+// CHECK10-NEXT: store i32 10, i32* [[I]], align 4
+// CHECK10-NEXT: br label [[DOTOMP_FINAL_DONE]]
+// CHECK10: .omp.final.done:
+// CHECK10-NEXT: ret void
+//
+//
+// CHECK10-LABEL: define {{[^@]+}}@__omp_outlined__5
+// CHECK10-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] {
+// CHECK10-NEXT: entry:
+// CHECK10-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK10-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK10-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4
+// CHECK10-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4
+// CHECK10-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4
+// CHECK10-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK10-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK10-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK10-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK10-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK10-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK10-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK10-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK10-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK10-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK10-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK10-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4
+// CHECK10-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4
+// CHECK10-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK10-NEXT: store i32 9, i32* [[DOTOMP_UB]], align 4
+// CHECK10-NEXT: [[TMP1:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK10-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK10-NEXT: store i32 [[TMP1]], i32* [[DOTOMP_LB]], align 4
+// CHECK10-NEXT: store i32 [[TMP2]], i32* [[DOTOMP_UB]], align 4
+// CHECK10-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK10-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK10-NEXT: [[TMP3:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK10-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4
+// CHECK10-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP4]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK10-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK10-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4
+// CHECK10-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK10: omp.inner.for.cond:
+// CHECK10-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK10-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK10-NEXT: [[CMP:%.*]] = icmp ule i32 [[TMP6]], [[TMP7]]
+// CHECK10-NEXT: br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK10: omp.inner.for.body:
+// CHECK10-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK10-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP8]], 1
+// CHECK10-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK10-NEXT: store i32 [[ADD]], i32* [[I]], align 4
+// CHECK10-NEXT: [[TMP9:%.*]] = load i32, i32* [[I]], align 4
+// CHECK10-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP0]], i32 0, i32 [[TMP9]]
+// CHECK10-NEXT: [[TMP10:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+// CHECK10-NEXT: [[ADD1:%.*]] = add nsw i32 [[TMP10]], 1
+// CHECK10-NEXT: store i32 [[ADD1]], i32* [[ARRAYIDX]], align 4
+// CHECK10-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK10: omp.body.continue:
+// CHECK10-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK10: omp.inner.for.inc:
+// CHECK10-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK10-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK10-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP11]], [[TMP12]]
+// CHECK10-NEXT: store i32 [[ADD2]], i32* [[DOTOMP_IV]], align 4
+// CHECK10-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP16:![0-9]+]]
+// CHECK10: omp.inner.for.end:
+// CHECK10-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK10: omp.loop.exit:
+// CHECK10-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP4]])
+// CHECK10-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK10-NEXT: [[TMP14:%.*]] = icmp ne i32 [[TMP13]], 0
+// CHECK10-NEXT: br i1 [[TMP14]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]]
+// CHECK10: .omp.final.then:
+// CHECK10-NEXT: store i32 10, i32* [[I]], align 4
+// CHECK10-NEXT: br label [[DOTOMP_FINAL_DONE]]
+// CHECK10: .omp.final.done:
+// CHECK10-NEXT: ret void
+//
+//
+// CHECK10-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l54
+// CHECK10-SAME: ([10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i32 [[F:%.*]]) #[[ATTR0]] {
+// CHECK10-NEXT: entry:
+// CHECK10-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4
+// CHECK10-NEXT: [[F_ADDR:%.*]] = alloca i32, align 4
+// CHECK10-NEXT: [[F_CASTED:%.*]] = alloca i32, align 4
+// CHECK10-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK10-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK10-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK10-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4
+// CHECK10-NEXT: store i32 [[F]], i32* [[F_ADDR]], align 4
+// CHECK10-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4
+// CHECK10-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK10-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0)
+// CHECK10-NEXT: br label [[DOTEXECUTE:%.*]]
+// CHECK10: .execute:
+// CHECK10-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]])
+// CHECK10-NEXT: [[TMP2:%.*]] = load i32, i32* [[F_ADDR]], align 4
+// CHECK10-NEXT: store i32 [[TMP2]], i32* [[F_CASTED]], align 4
+// CHECK10-NEXT: [[TMP3:%.*]] = load i32, i32* [[F_CASTED]], align 4
+// CHECK10-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK10-NEXT: call void @__omp_outlined__6(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x [10 x i32]]* [[TMP0]], i32 [[TMP3]]) #[[ATTR2]]
+// CHECK10-NEXT: br label [[DOTOMP_DEINIT:%.*]]
+// CHECK10: .omp.deinit:
+// CHECK10-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0)
+// CHECK10-NEXT: br label [[DOTEXIT:%.*]]
+// CHECK10: .exit:
+// CHECK10-NEXT: ret void
+//
+//
+// CHECK10-LABEL: define {{[^@]+}}@__omp_outlined__6
+// CHECK10-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i32 [[F:%.*]]) #[[ATTR0]] {
+// CHECK10-NEXT: entry:
+// CHECK10-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK10-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK10-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4
+// CHECK10-NEXT: [[F_ADDR:%.*]] = alloca i32, align 4
+// CHECK10-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK10-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK10-NEXT: [[_TMP1:%.*]] = alloca i32, align 4
+// CHECK10-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK10-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK10-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK10-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK10-NEXT: [[K:%.*]] = alloca i32, align 4
+// CHECK10-NEXT: [[I:%.*]]
= alloca i32, align 4 +// CHECK10-NEXT: [[J:%.*]] = alloca i32, align 4 +// CHECK10-NEXT: [[F_CASTED:%.*]] = alloca i32, align 4 +// CHECK10-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x i8*], align 4 +// CHECK10-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK10-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK10-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4 +// CHECK10-NEXT: store i32 [[F]], i32* [[F_ADDR]], align 4 +// CHECK10-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4 +// CHECK10-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK10-NEXT: store i32 99, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK10-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK10-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK10-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK10-NEXT: [[TMP1:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK10-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4 +// CHECK10-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) +// CHECK10-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK10-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP3]], 99 +// CHECK10-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK10: cond.true: +// CHECK10-NEXT: br label [[COND_END:%.*]] +// CHECK10: cond.false: +// CHECK10-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK10-NEXT: br label [[COND_END]] +// CHECK10: cond.end: +// CHECK10-NEXT: [[COND:%.*]] = phi i32 [ 99, [[COND_TRUE]] ], [ [[TMP4]], [[COND_FALSE]] ] +// CHECK10-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK10-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK10-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 +// CHECK10-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK10: omp.inner.for.cond: +// CHECK10-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK10-NEXT: [[CMP2:%.*]] = icmp slt i32 [[TMP6]], 100 +// CHECK10-NEXT: br i1 [[CMP2]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK10: omp.inner.for.body: +// CHECK10-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK10-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK10-NEXT: [[TMP9:%.*]] = load i32, i32* [[F_ADDR]], align 4 +// CHECK10-NEXT: store i32 [[TMP9]], i32* [[F_CASTED]], align 4 +// CHECK10-NEXT: [[TMP10:%.*]] = load i32, i32* [[F_CASTED]], align 4 +// CHECK10-NEXT: [[TMP11:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 +// CHECK10-NEXT: [[TMP12:%.*]] = inttoptr i32 [[TMP7]] to i8* +// CHECK10-NEXT: store i8* [[TMP12]], i8** [[TMP11]], align 4 +// CHECK10-NEXT: [[TMP13:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 +// CHECK10-NEXT: [[TMP14:%.*]] = inttoptr i32 [[TMP8]] to i8* +// CHECK10-NEXT: store i8* [[TMP14]], i8** [[TMP13]], align 4 +// CHECK10-NEXT: [[TMP15:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 +// CHECK10-NEXT: [[TMP16:%.*]] = bitcast [10 x [10 x i32]]* [[TMP0]] to i8* +// CHECK10-NEXT: store i8* [[TMP16]], i8** [[TMP15]], 
align 4 +// CHECK10-NEXT: [[TMP17:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3 +// CHECK10-NEXT: [[TMP18:%.*]] = inttoptr i32 [[TMP10]] to i8* +// CHECK10-NEXT: store i8* [[TMP18]], i8** [[TMP17]], align 4 +// CHECK10-NEXT: [[TMP19:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK10-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, [10 x [10 x i32]]*, i32)* @__omp_outlined__7 to i8*), i8* null, i8** [[TMP19]], i32 4) +// CHECK10-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK10: omp.inner.for.inc: +// CHECK10-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK10-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK10-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP20]], [[TMP21]] +// CHECK10-NEXT: store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4 +// CHECK10-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK10-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK10-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP22]], [[TMP23]] +// CHECK10-NEXT: store i32 [[ADD3]], i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK10-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK10-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK10-NEXT: [[ADD4:%.*]] = add nsw i32 [[TMP24]], [[TMP25]] +// CHECK10-NEXT: store i32 [[ADD4]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK10-NEXT: [[TMP26:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK10-NEXT: [[CMP5:%.*]] = icmp sgt i32 [[TMP26]], 99 +// CHECK10-NEXT: br i1 [[CMP5]], label [[COND_TRUE6:%.*]], label [[COND_FALSE7:%.*]] +// CHECK10: cond.true6: +// CHECK10-NEXT: br label [[COND_END8:%.*]] +// CHECK10: cond.false7: +// CHECK10-NEXT: [[TMP27:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK10-NEXT: br label [[COND_END8]] +// CHECK10: cond.end8: +// CHECK10-NEXT: [[COND9:%.*]] = phi i32 [ 99, [[COND_TRUE6]] ], [ [[TMP27]], [[COND_FALSE7]] ] +// CHECK10-NEXT: store i32 [[COND9]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK10-NEXT: [[TMP28:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK10-NEXT: store i32 [[TMP28]], i32* [[DOTOMP_IV]], align 4 +// CHECK10-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP17:![0-9]+]] +// CHECK10: omp.inner.for.end: +// CHECK10-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK10: omp.loop.exit: +// CHECK10-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]]) +// CHECK10-NEXT: [[TMP29:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK10-NEXT: [[TMP30:%.*]] = icmp ne i32 [[TMP29]], 0 +// CHECK10-NEXT: br i1 [[TMP30]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] +// CHECK10: .omp.final.then: +// CHECK10-NEXT: store i32 10, i32* [[I]], align 4 +// CHECK10-NEXT: store i32 10, i32* [[J]], align 4 +// CHECK10-NEXT: br label [[DOTOMP_FINAL_DONE]] +// CHECK10: .omp.final.done: +// CHECK10-NEXT: ret void +// +// +// CHECK10-LABEL: define {{[^@]+}}@__omp_outlined__7 +// CHECK10-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i32 [[F:%.*]]) #[[ATTR0]] { +// CHECK10-NEXT: entry: +// CHECK10-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK10-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK10-NEXT: 
[[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 +// CHECK10-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 +// CHECK10-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4 +// CHECK10-NEXT: [[F_ADDR:%.*]] = alloca i32, align 4 +// CHECK10-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK10-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK10-NEXT: [[_TMP1:%.*]] = alloca i32, align 4 +// CHECK10-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK10-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK10-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK10-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK10-NEXT: [[K:%.*]] = alloca i32, align 4 +// CHECK10-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK10-NEXT: [[J:%.*]] = alloca i32, align 4 +// CHECK10-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK10-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK10-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK10-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK10-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4 +// CHECK10-NEXT: store i32 [[F]], i32* [[F_ADDR]], align 4 +// CHECK10-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4 +// CHECK10-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 +// CHECK10-NEXT: store i32 99, i32* [[DOTOMP_UB]], align 4 +// CHECK10-NEXT: [[TMP1:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK10-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK10-NEXT: store i32 [[TMP1]], i32* [[DOTOMP_LB]], align 4 +// CHECK10-NEXT: store i32 [[TMP2]], i32* [[DOTOMP_UB]], align 4 +// CHECK10-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK10-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK10-NEXT: [[TMP3:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK10-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4 +// CHECK10-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP4]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) +// CHECK10-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK10-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 +// CHECK10-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK10: omp.inner.for.cond: +// CHECK10-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK10-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK10-NEXT: [[CMP:%.*]] = icmp ule i32 [[TMP6]], [[TMP7]] +// CHECK10-NEXT: br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK10: omp.inner.for.body: +// CHECK10-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK10-NEXT: [[DIV:%.*]] = sdiv i32 [[TMP8]], 10 +// CHECK10-NEXT: [[MUL:%.*]] = mul nsw i32 [[DIV]], 1 +// CHECK10-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK10-NEXT: store i32 [[ADD]], i32* [[I]], align 4 +// CHECK10-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK10-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK10-NEXT: [[DIV2:%.*]] = sdiv i32 [[TMP10]], 10 +// CHECK10-NEXT: [[MUL3:%.*]] = mul nsw i32 [[DIV2]], 10 +// CHECK10-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP9]], [[MUL3]] +// CHECK10-NEXT: [[MUL4:%.*]] = mul nsw i32 [[SUB]], 1 +// 
CHECK10-NEXT: [[ADD5:%.*]] = add nsw i32 0, [[MUL4]] +// CHECK10-NEXT: store i32 [[ADD5]], i32* [[J]], align 4 +// CHECK10-NEXT: store i32 10, i32* [[K]], align 4 +// CHECK10-NEXT: [[TMP11:%.*]] = load i32, i32* [[I]], align 4 +// CHECK10-NEXT: [[TMP12:%.*]] = load i32, i32* [[J]], align 4 +// CHECK10-NEXT: [[TMP13:%.*]] = load i32, i32* [[F_ADDR]], align 4 +// CHECK10-NEXT: [[MUL6:%.*]] = mul nsw i32 [[TMP12]], [[TMP13]] +// CHECK10-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP11]], [[MUL6]] +// CHECK10-NEXT: [[TMP14:%.*]] = load i32, i32* [[K]], align 4 +// CHECK10-NEXT: [[ADD8:%.*]] = add nsw i32 [[ADD7]], [[TMP14]] +// CHECK10-NEXT: [[TMP15:%.*]] = load i32, i32* [[I]], align 4 +// CHECK10-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* [[TMP0]], i32 0, i32 [[TMP15]] +// CHECK10-NEXT: [[TMP16:%.*]] = load i32, i32* [[J]], align 4 +// CHECK10-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[ARRAYIDX]], i32 0, i32 [[TMP16]] +// CHECK10-NEXT: store i32 [[ADD8]], i32* [[ARRAYIDX9]], align 4 +// CHECK10-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK10: omp.body.continue: +// CHECK10-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK10: omp.inner.for.inc: +// CHECK10-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK10-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK10-NEXT: [[ADD10:%.*]] = add nsw i32 [[TMP17]], [[TMP18]] +// CHECK10-NEXT: store i32 [[ADD10]], i32* [[DOTOMP_IV]], align 4 +// CHECK10-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP18:![0-9]+]] +// CHECK10: omp.inner.for.end: +// CHECK10-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK10: omp.loop.exit: +// CHECK10-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP4]]) +// CHECK10-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK10-NEXT: [[TMP20:%.*]] = icmp ne i32 [[TMP19]], 0 +// CHECK10-NEXT: br i1 [[TMP20]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] +// CHECK10: .omp.final.then: +// CHECK10-NEXT: store i32 10, i32* [[I]], align 4 +// CHECK10-NEXT: store i32 10, i32* [[J]], align 4 +// CHECK10-NEXT: br label [[DOTOMP_FINAL_DONE]] +// CHECK10: .omp.final.done: +// CHECK10-NEXT: ret void +// +// +// CHECK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l38 +// CHECK11-SAME: (i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32 [[L:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK11-NEXT: entry: +// CHECK11-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4 +// CHECK11-NEXT: [[L_ADDR:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[L_CASTED:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK11-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK11-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4 +// CHECK11-NEXT: store i32 [[L]], i32* [[L_ADDR]], align 4 +// CHECK11-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4 +// CHECK11-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK11-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) +// CHECK11-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK11: .execute: 
+// CHECK11-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3:[0-9]+]]) +// CHECK11-NEXT: [[TMP2:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK11-NEXT: store i32 [[TMP2]], i32* [[N_CASTED]], align 4 +// CHECK11-NEXT: [[TMP3:%.*]] = load i32, i32* [[N_CASTED]], align 4 +// CHECK11-NEXT: [[TMP4:%.*]] = load i32, i32* [[L_ADDR]], align 4 +// CHECK11-NEXT: store i32 [[TMP4]], i32* [[L_CASTED]], align 4 +// CHECK11-NEXT: [[TMP5:%.*]] = load i32, i32* [[L_CASTED]], align 4 +// CHECK11-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK11-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32 [[TMP3]], [1000 x i32]* [[TMP0]], i32 [[TMP5]]) #[[ATTR2:[0-9]+]] +// CHECK11-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK11: .omp.deinit: +// CHECK11-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) +// CHECK11-NEXT: br label [[DOTEXIT:%.*]] +// CHECK11: .exit: +// CHECK11-NEXT: ret void +// +// +// CHECK11-LABEL: define {{[^@]+}}@__omp_outlined__ +// CHECK11-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32 [[L:%.*]]) #[[ATTR0]] { +// CHECK11-NEXT: entry: +// CHECK11-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK11-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK11-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4 +// CHECK11-NEXT: [[L_ADDR:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[I4:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[L_CASTED:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [5 x i8*], align 4 +// CHECK11-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK11-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK11-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK11-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4 +// CHECK11-NEXT: store i32 [[L]], i32* [[L_ADDR]], align 4 +// CHECK11-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4 +// CHECK11-NEXT: [[TMP1:%.*]] = call i8* @__kmpc_data_sharing_push_stack(i32 4, i16 1) +// CHECK11-NEXT: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to %struct._globalized_locals_ty* +// CHECK11-NEXT: [[L1:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP2]], i32 0, i32 0 +// CHECK11-NEXT: [[TMP3:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK11-NEXT: store i32 [[TMP3]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK11-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK11-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP4]], 0 +// CHECK11-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK11-NEXT: [[SUB3:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK11-NEXT: store i32 
[[SUB3]], i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK11-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK11-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK11-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP5]] +// CHECK11-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK11: omp.precond.then: +// CHECK11-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK11-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK11-NEXT: store i32 [[TMP6]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK11-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK11-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK11-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK11-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4 +// CHECK11-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1:[0-9]+]], i32 [[TMP8]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 128) +// CHECK11-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK11-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK11-NEXT: [[CMP5:%.*]] = icmp sgt i32 [[TMP9]], [[TMP10]] +// CHECK11-NEXT: br i1 [[CMP5]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK11: cond.true: +// CHECK11-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK11-NEXT: br label [[COND_END:%.*]] +// CHECK11: cond.false: +// CHECK11-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK11-NEXT: br label [[COND_END]] +// CHECK11: cond.end: +// CHECK11-NEXT: [[COND:%.*]] = phi i32 [ [[TMP11]], [[COND_TRUE]] ], [ [[TMP12]], [[COND_FALSE]] ] +// CHECK11-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK11-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK11-NEXT: store i32 [[TMP13]], i32* [[DOTOMP_IV]], align 4 +// CHECK11-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK11: omp.inner.for.cond: +// CHECK11-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK11-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK11-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP15]], 1 +// CHECK11-NEXT: [[CMP6:%.*]] = icmp slt i32 [[TMP14]], [[ADD]] +// CHECK11-NEXT: br i1 [[CMP6]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK11: omp.inner.for.body: +// CHECK11-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK11-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK11-NEXT: [[TMP18:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK11-NEXT: store i32 [[TMP18]], i32* [[N_CASTED]], align 4 +// CHECK11-NEXT: [[TMP19:%.*]] = load i32, i32* [[N_CASTED]], align 4 +// CHECK11-NEXT: [[TMP20:%.*]] = load i32, i32* [[L_ADDR]], align 4 +// CHECK11-NEXT: store i32 [[TMP20]], i32* [[L_CASTED]], align 4 +// CHECK11-NEXT: [[TMP21:%.*]] = load i32, i32* [[L_CASTED]], align 4 +// CHECK11-NEXT: [[TMP22:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 +// CHECK11-NEXT: [[TMP23:%.*]] = inttoptr i32 [[TMP16]] to i8* +// CHECK11-NEXT: store i8* [[TMP23]], i8** [[TMP22]], align 4 +// CHECK11-NEXT: [[TMP24:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 +// CHECK11-NEXT: [[TMP25:%.*]] = inttoptr i32 [[TMP17]] to i8* +// CHECK11-NEXT: store i8* [[TMP25]], i8** [[TMP24]], 
align 4 +// CHECK11-NEXT: [[TMP26:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 +// CHECK11-NEXT: [[TMP27:%.*]] = inttoptr i32 [[TMP19]] to i8* +// CHECK11-NEXT: store i8* [[TMP27]], i8** [[TMP26]], align 4 +// CHECK11-NEXT: [[TMP28:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3 +// CHECK11-NEXT: [[TMP29:%.*]] = bitcast [1000 x i32]* [[TMP0]] to i8* +// CHECK11-NEXT: store i8* [[TMP29]], i8** [[TMP28]], align 4 +// CHECK11-NEXT: [[TMP30:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 4 +// CHECK11-NEXT: [[TMP31:%.*]] = inttoptr i32 [[TMP21]] to i8* +// CHECK11-NEXT: store i8* [[TMP31]], i8** [[TMP30]], align 4 +// CHECK11-NEXT: [[TMP32:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK11-NEXT: [[TMP33:%.*]] = load i32, i32* [[TMP32]], align 4 +// CHECK11-NEXT: [[TMP34:%.*]] = bitcast [5 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK11-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP33]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, i32, [1000 x i32]*, i32)* @__omp_outlined__1 to i8*), i8* null, i8** [[TMP34]], i32 5) +// CHECK11-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK11: omp.inner.for.inc: +// CHECK11-NEXT: [[TMP35:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK11-NEXT: [[TMP36:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK11-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP35]], [[TMP36]] +// CHECK11-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_IV]], align 4 +// CHECK11-NEXT: [[TMP37:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK11-NEXT: [[TMP38:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK11-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP37]], [[TMP38]] +// CHECK11-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK11-NEXT: [[TMP39:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK11-NEXT: [[TMP40:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK11-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP39]], [[TMP40]] +// CHECK11-NEXT: store i32 [[ADD9]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK11-NEXT: [[TMP41:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK11-NEXT: [[TMP42:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK11-NEXT: [[CMP10:%.*]] = icmp sgt i32 [[TMP41]], [[TMP42]] +// CHECK11-NEXT: br i1 [[CMP10]], label [[COND_TRUE11:%.*]], label [[COND_FALSE12:%.*]] +// CHECK11: cond.true11: +// CHECK11-NEXT: [[TMP43:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK11-NEXT: br label [[COND_END13:%.*]] +// CHECK11: cond.false12: +// CHECK11-NEXT: [[TMP44:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK11-NEXT: br label [[COND_END13]] +// CHECK11: cond.end13: +// CHECK11-NEXT: [[COND14:%.*]] = phi i32 [ [[TMP43]], [[COND_TRUE11]] ], [ [[TMP44]], [[COND_FALSE12]] ] +// CHECK11-NEXT: store i32 [[COND14]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK11-NEXT: [[TMP45:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK11-NEXT: store i32 [[TMP45]], i32* [[DOTOMP_IV]], align 4 +// CHECK11-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP10:![0-9]+]] +// CHECK11: omp.inner.for.end: +// CHECK11-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK11: omp.loop.exit: +// CHECK11-NEXT: [[TMP46:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK11-NEXT: [[TMP47:%.*]] = load i32, i32* [[TMP46]], align 4 +// CHECK11-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* 
@[[GLOB1]], i32 [[TMP47]]) +// CHECK11-NEXT: [[TMP48:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK11-NEXT: [[TMP49:%.*]] = icmp ne i32 [[TMP48]], 0 +// CHECK11-NEXT: br i1 [[TMP49]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] +// CHECK11: .omp.final.then: +// CHECK11-NEXT: [[TMP50:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK11-NEXT: [[SUB15:%.*]] = sub nsw i32 [[TMP50]], 0 +// CHECK11-NEXT: [[DIV16:%.*]] = sdiv i32 [[SUB15]], 1 +// CHECK11-NEXT: [[MUL:%.*]] = mul nsw i32 [[DIV16]], 1 +// CHECK11-NEXT: [[ADD17:%.*]] = add nsw i32 0, [[MUL]] +// CHECK11-NEXT: store i32 [[ADD17]], i32* [[I4]], align 4 +// CHECK11-NEXT: br label [[DOTOMP_FINAL_DONE]] +// CHECK11: .omp.final.done: +// CHECK11-NEXT: [[TMP51:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK11-NEXT: [[TMP52:%.*]] = icmp ne i32 [[TMP51]], 0 +// CHECK11-NEXT: br i1 [[TMP52]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]] +// CHECK11: .omp.lastprivate.then: +// CHECK11-NEXT: [[TMP53:%.*]] = load i32, i32* [[L_ADDR]], align 4 +// CHECK11-NEXT: store i32 [[TMP53]], i32* [[L_ADDR]], align 4 +// CHECK11-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]] +// CHECK11: .omp.lastprivate.done: +// CHECK11-NEXT: br label [[OMP_PRECOND_END]] +// CHECK11: omp.precond.end: +// CHECK11-NEXT: call void @__kmpc_data_sharing_pop_stack(i8* [[TMP1]]) +// CHECK11-NEXT: ret void +// +// +// CHECK11-LABEL: define {{[^@]+}}@__omp_outlined__1 +// CHECK11-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32 [[L:%.*]]) #[[ATTR0]] { +// CHECK11-NEXT: entry: +// CHECK11-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK11-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK11-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4 +// CHECK11-NEXT: [[L_ADDR:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[I3:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK11-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK11-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK11-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK11-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK11-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4 +// CHECK11-NEXT: store i32 [[L]], i32* [[L_ADDR]], align 4 +// CHECK11-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4 +// CHECK11-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK11-NEXT: store i32 [[TMP1]], i32* 
[[DOTCAPTURE_EXPR_]], align 4 +// CHECK11-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK11-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 +// CHECK11-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK11-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK11-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK11-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK11-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK11-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] +// CHECK11-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK11: omp.precond.then: +// CHECK11-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 +// CHECK11-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK11-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_UB]], align 4 +// CHECK11-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK11-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK11-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_LB]], align 4 +// CHECK11-NEXT: store i32 [[TMP6]], i32* [[DOTOMP_UB]], align 4 +// CHECK11-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK11-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK11-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK11-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4 +// CHECK11-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP8]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 32) +// CHECK11-NEXT: br label [[OMP_DISPATCH_COND:%.*]] +// CHECK11: omp.dispatch.cond: +// CHECK11-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK11-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK11-NEXT: [[CMP4:%.*]] = icmp ugt i32 [[TMP9]], [[TMP10]] +// CHECK11-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK11: cond.true: +// CHECK11-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK11-NEXT: br label [[COND_END:%.*]] +// CHECK11: cond.false: +// CHECK11-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK11-NEXT: br label [[COND_END]] +// CHECK11: cond.end: +// CHECK11-NEXT: [[COND:%.*]] = phi i32 [ [[TMP11]], [[COND_TRUE]] ], [ [[TMP12]], [[COND_FALSE]] ] +// CHECK11-NEXT: store i32 [[COND]], i32* [[DOTOMP_UB]], align 4 +// CHECK11-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK11-NEXT: store i32 [[TMP13]], i32* [[DOTOMP_IV]], align 4 +// CHECK11-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK11-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK11-NEXT: [[CMP5:%.*]] = icmp sle i32 [[TMP14]], [[TMP15]] +// CHECK11-NEXT: br i1 [[CMP5]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]] +// CHECK11: omp.dispatch.body: +// CHECK11-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK11: omp.inner.for.cond: +// CHECK11-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK11-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK11-NEXT: [[CMP6:%.*]] = icmp sle i32 [[TMP16]], [[TMP17]] +// CHECK11-NEXT: br i1 [[CMP6]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK11: omp.inner.for.body: +// CHECK11-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK11-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP18]], 
1 +// CHECK11-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK11-NEXT: store i32 [[ADD]], i32* [[I3]], align 4 +// CHECK11-NEXT: [[TMP19:%.*]] = load i32, i32* [[I3]], align 4 +// CHECK11-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1000 x i32], [1000 x i32]* [[TMP0]], i32 0, i32 [[TMP19]] +// CHECK11-NEXT: store i32 1, i32* [[ARRAYIDX]], align 4 +// CHECK11-NEXT: [[TMP20:%.*]] = load i32, i32* [[I3]], align 4 +// CHECK11-NEXT: store i32 [[TMP20]], i32* [[L_ADDR]], align 4 +// CHECK11-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK11: omp.body.continue: +// CHECK11-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK11: omp.inner.for.inc: +// CHECK11-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK11-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP21]], 1 +// CHECK11-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_IV]], align 4 +// CHECK11-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP12:![0-9]+]] +// CHECK11: omp.inner.for.end: +// CHECK11-NEXT: br label [[OMP_DISPATCH_INC:%.*]] +// CHECK11: omp.dispatch.inc: +// CHECK11-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK11-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK11-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP22]], [[TMP23]] +// CHECK11-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_LB]], align 4 +// CHECK11-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK11-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK11-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP24]], [[TMP25]] +// CHECK11-NEXT: store i32 [[ADD9]], i32* [[DOTOMP_UB]], align 4 +// CHECK11-NEXT: br label [[OMP_DISPATCH_COND]] +// CHECK11: omp.dispatch.end: +// CHECK11-NEXT: [[TMP26:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK11-NEXT: [[TMP27:%.*]] = load i32, i32* [[TMP26]], align 4 +// CHECK11-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP27]]) +// CHECK11-NEXT: [[TMP28:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK11-NEXT: [[TMP29:%.*]] = icmp ne i32 [[TMP28]], 0 +// CHECK11-NEXT: br i1 [[TMP29]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] +// CHECK11: .omp.final.then: +// CHECK11-NEXT: [[TMP30:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK11-NEXT: [[SUB10:%.*]] = sub nsw i32 [[TMP30]], 0 +// CHECK11-NEXT: [[DIV11:%.*]] = sdiv i32 [[SUB10]], 1 +// CHECK11-NEXT: [[MUL12:%.*]] = mul nsw i32 [[DIV11]], 1 +// CHECK11-NEXT: [[ADD13:%.*]] = add nsw i32 0, [[MUL12]] +// CHECK11-NEXT: store i32 [[ADD13]], i32* [[I3]], align 4 +// CHECK11-NEXT: br label [[DOTOMP_FINAL_DONE]] +// CHECK11: .omp.final.done: +// CHECK11-NEXT: [[TMP31:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK11-NEXT: [[TMP32:%.*]] = icmp ne i32 [[TMP31]], 0 +// CHECK11-NEXT: br i1 [[TMP32]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]] +// CHECK11: .omp.lastprivate.then: +// CHECK11-NEXT: [[TMP33:%.*]] = load i32, i32* [[L_ADDR]], align 4 +// CHECK11-NEXT: store i32 [[TMP33]], i32* [[L_ADDR]], align 4 +// CHECK11-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]] +// CHECK11: .omp.lastprivate.done: +// CHECK11-NEXT: br label [[OMP_PRECOND_END]] +// CHECK11: omp.precond.end: +// CHECK11-NEXT: ret void +// +// +// CHECK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l44 +// CHECK11-SAME: (i32 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { +// CHECK11-NEXT: entry: +// CHECK11-NEXT: [[N_ADDR:%.*]] = alloca 
i32, align 4 +// CHECK11-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 4 +// CHECK11-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK11-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK11-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 4 +// CHECK11-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 4 +// CHECK11-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK11-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) +// CHECK11-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK11: .execute: +// CHECK11-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) +// CHECK11-NEXT: [[TMP2:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK11-NEXT: store i32 [[TMP2]], i32* [[N_CASTED]], align 4 +// CHECK11-NEXT: [[TMP3:%.*]] = load i32, i32* [[N_CASTED]], align 4 +// CHECK11-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK11-NEXT: call void @__omp_outlined__2(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32 [[TMP3]], [1000 x i16]* [[TMP0]]) #[[ATTR2]] +// CHECK11-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK11: .omp.deinit: +// CHECK11-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) +// CHECK11-NEXT: br label [[DOTEXIT:%.*]] +// CHECK11: .exit: +// CHECK11-NEXT: ret void +// +// +// CHECK11-LABEL: define {{[^@]+}}@__omp_outlined__2 +// CHECK11-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { +// CHECK11-NEXT: entry: +// CHECK11-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK11-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK11-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 4 +// CHECK11-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[I3:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x i8*], align 4 +// CHECK11-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK11-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK11-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK11-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 4 +// CHECK11-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 4 +// CHECK11-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK11-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK11-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK11-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 +// CHECK11-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK11-NEXT: 
[[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK11-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK11-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK11-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK11-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] +// CHECK11-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK11: omp.precond.then: +// CHECK11-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK11-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK11-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK11-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK11-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK11-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK11-NEXT: [[TMP5:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK11-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4 +// CHECK11-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP6]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) +// CHECK11-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK11-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK11-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP7]], [[TMP8]] +// CHECK11-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK11: cond.true: +// CHECK11-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK11-NEXT: br label [[COND_END:%.*]] +// CHECK11: cond.false: +// CHECK11-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK11-NEXT: br label [[COND_END]] +// CHECK11: cond.end: +// CHECK11-NEXT: [[COND:%.*]] = phi i32 [ [[TMP9]], [[COND_TRUE]] ], [ [[TMP10]], [[COND_FALSE]] ] +// CHECK11-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK11-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK11-NEXT: store i32 [[TMP11]], i32* [[DOTOMP_IV]], align 4 +// CHECK11-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK11: omp.inner.for.cond: +// CHECK11-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK11-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK11-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP13]], 1 +// CHECK11-NEXT: [[CMP5:%.*]] = icmp slt i32 [[TMP12]], [[ADD]] +// CHECK11-NEXT: br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK11: omp.inner.for.body: +// CHECK11-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK11-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK11-NEXT: [[TMP16:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK11-NEXT: store i32 [[TMP16]], i32* [[N_CASTED]], align 4 +// CHECK11-NEXT: [[TMP17:%.*]] = load i32, i32* [[N_CASTED]], align 4 +// CHECK11-NEXT: [[TMP18:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 +// CHECK11-NEXT: [[TMP19:%.*]] = inttoptr i32 [[TMP14]] to i8* +// CHECK11-NEXT: store i8* [[TMP19]], i8** [[TMP18]], align 4 +// CHECK11-NEXT: [[TMP20:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 +// CHECK11-NEXT: [[TMP21:%.*]] = inttoptr i32 [[TMP15]] to i8* +// CHECK11-NEXT: store i8* [[TMP21]], i8** [[TMP20]], align 4 +// CHECK11-NEXT: [[TMP22:%.*]] = 
getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 +// CHECK11-NEXT: [[TMP23:%.*]] = inttoptr i32 [[TMP17]] to i8* +// CHECK11-NEXT: store i8* [[TMP23]], i8** [[TMP22]], align 4 +// CHECK11-NEXT: [[TMP24:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3 +// CHECK11-NEXT: [[TMP25:%.*]] = bitcast [1000 x i16]* [[TMP0]] to i8* +// CHECK11-NEXT: store i8* [[TMP25]], i8** [[TMP24]], align 4 +// CHECK11-NEXT: [[TMP26:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK11-NEXT: [[TMP27:%.*]] = load i32, i32* [[TMP26]], align 4 +// CHECK11-NEXT: [[TMP28:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK11-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP27]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, i32, [1000 x i16]*)* @__omp_outlined__3 to i8*), i8* null, i8** [[TMP28]], i32 4) +// CHECK11-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK11: omp.inner.for.inc: +// CHECK11-NEXT: [[TMP29:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK11-NEXT: [[TMP30:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK11-NEXT: [[ADD6:%.*]] = add nsw i32 [[TMP29]], [[TMP30]] +// CHECK11-NEXT: store i32 [[ADD6]], i32* [[DOTOMP_IV]], align 4 +// CHECK11-NEXT: [[TMP31:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK11-NEXT: [[TMP32:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK11-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP31]], [[TMP32]] +// CHECK11-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK11-NEXT: [[TMP33:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK11-NEXT: [[TMP34:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK11-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP33]], [[TMP34]] +// CHECK11-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK11-NEXT: [[TMP35:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK11-NEXT: [[TMP36:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK11-NEXT: [[CMP9:%.*]] = icmp sgt i32 [[TMP35]], [[TMP36]] +// CHECK11-NEXT: br i1 [[CMP9]], label [[COND_TRUE10:%.*]], label [[COND_FALSE11:%.*]] +// CHECK11: cond.true10: +// CHECK11-NEXT: [[TMP37:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK11-NEXT: br label [[COND_END12:%.*]] +// CHECK11: cond.false11: +// CHECK11-NEXT: [[TMP38:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK11-NEXT: br label [[COND_END12]] +// CHECK11: cond.end12: +// CHECK11-NEXT: [[COND13:%.*]] = phi i32 [ [[TMP37]], [[COND_TRUE10]] ], [ [[TMP38]], [[COND_FALSE11]] ] +// CHECK11-NEXT: store i32 [[COND13]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK11-NEXT: [[TMP39:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK11-NEXT: store i32 [[TMP39]], i32* [[DOTOMP_IV]], align 4 +// CHECK11-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP13:![0-9]+]] +// CHECK11: omp.inner.for.end: +// CHECK11-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK11: omp.loop.exit: +// CHECK11-NEXT: [[TMP40:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK11-NEXT: [[TMP41:%.*]] = load i32, i32* [[TMP40]], align 4 +// CHECK11-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP41]]) +// CHECK11-NEXT: [[TMP42:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK11-NEXT: [[TMP43:%.*]] = icmp ne i32 [[TMP42]], 0 +// CHECK11-NEXT: br i1 [[TMP43]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] +// CHECK11: .omp.final.then: +// 
CHECK11-NEXT: [[TMP44:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK11-NEXT: [[SUB14:%.*]] = sub nsw i32 [[TMP44]], 0 +// CHECK11-NEXT: [[DIV15:%.*]] = sdiv i32 [[SUB14]], 1 +// CHECK11-NEXT: [[MUL:%.*]] = mul nsw i32 [[DIV15]], 1 +// CHECK11-NEXT: [[ADD16:%.*]] = add nsw i32 0, [[MUL]] +// CHECK11-NEXT: store i32 [[ADD16]], i32* [[I3]], align 4 +// CHECK11-NEXT: br label [[DOTOMP_FINAL_DONE]] +// CHECK11: .omp.final.done: +// CHECK11-NEXT: br label [[OMP_PRECOND_END]] +// CHECK11: omp.precond.end: +// CHECK11-NEXT: ret void +// +// +// CHECK11-LABEL: define {{[^@]+}}@__omp_outlined__3 +// CHECK11-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { +// CHECK11-NEXT: entry: +// CHECK11-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK11-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK11-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 4 +// CHECK11-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[I3:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK11-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK11-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK11-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK11-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK11-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 4 +// CHECK11-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 4 +// CHECK11-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK11-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK11-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK11-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 +// CHECK11-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK11-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK11-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK11-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK11-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK11-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] +// CHECK11-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK11: omp.precond.then: +// CHECK11-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 +// CHECK11-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK11-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_UB]], align 4 +// CHECK11-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK11-NEXT: [[TMP6:%.*]] = load i32, i32* 
[[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK11-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_LB]], align 4 +// CHECK11-NEXT: store i32 [[TMP6]], i32* [[DOTOMP_UB]], align 4 +// CHECK11-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK11-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK11-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK11-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4 +// CHECK11-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP8]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) +// CHECK11-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK11-NEXT: store i32 [[TMP9]], i32* [[DOTOMP_IV]], align 4 +// CHECK11-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK11: omp.inner.for.cond: +// CHECK11-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK11-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK11-NEXT: [[CMP4:%.*]] = icmp ule i32 [[TMP10]], [[TMP11]] +// CHECK11-NEXT: br i1 [[CMP4]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK11: omp.inner.for.body: +// CHECK11-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK11-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP12]], 1 +// CHECK11-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK11-NEXT: store i32 [[ADD]], i32* [[I3]], align 4 +// CHECK11-NEXT: [[TMP13:%.*]] = load i32, i32* [[I3]], align 4 +// CHECK11-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1000 x i16], [1000 x i16]* [[TMP0]], i32 0, i32 [[TMP13]] +// CHECK11-NEXT: [[TMP14:%.*]] = load i16, i16* [[ARRAYIDX]], align 2 +// CHECK11-NEXT: [[CONV:%.*]] = sext i16 [[TMP14]] to i32 +// CHECK11-NEXT: [[ADD5:%.*]] = add nsw i32 [[CONV]], 1 +// CHECK11-NEXT: [[CONV6:%.*]] = trunc i32 [[ADD5]] to i16 +// CHECK11-NEXT: store i16 [[CONV6]], i16* [[ARRAYIDX]], align 2 +// CHECK11-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK11: omp.body.continue: +// CHECK11-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK11: omp.inner.for.inc: +// CHECK11-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK11-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK11-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP15]], [[TMP16]] +// CHECK11-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_IV]], align 4 +// CHECK11-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP14:![0-9]+]] +// CHECK11: omp.inner.for.end: +// CHECK11-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK11: omp.loop.exit: +// CHECK11-NEXT: [[TMP17:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK11-NEXT: [[TMP18:%.*]] = load i32, i32* [[TMP17]], align 4 +// CHECK11-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP18]]) +// CHECK11-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK11-NEXT: [[TMP20:%.*]] = icmp ne i32 [[TMP19]], 0 +// CHECK11-NEXT: br i1 [[TMP20]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] +// CHECK11: .omp.final.then: +// CHECK11-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK11-NEXT: [[SUB8:%.*]] = sub nsw i32 [[TMP21]], 0 +// CHECK11-NEXT: [[DIV9:%.*]] = sdiv i32 [[SUB8]], 1 +// CHECK11-NEXT: [[MUL10:%.*]] = mul nsw i32 [[DIV9]], 1 +// CHECK11-NEXT: [[ADD11:%.*]] = add nsw i32 0, [[MUL10]] +// CHECK11-NEXT: store i32 [[ADD11]], i32* [[I3]], align 4 +// CHECK11-NEXT: br label [[DOTOMP_FINAL_DONE]] +// CHECK11: 
.omp.final.done: +// CHECK11-NEXT: br label [[OMP_PRECOND_END]] +// CHECK11: omp.precond.end: +// CHECK11-NEXT: ret void +// +// +// CHECK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l49 +// CHECK11-SAME: ([10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { +// CHECK11-NEXT: entry: +// CHECK11-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 +// CHECK11-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK11-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 +// CHECK11-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 +// CHECK11-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK11-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) +// CHECK11-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK11: .execute: +// CHECK11-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) +// CHECK11-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK11-NEXT: call void @__omp_outlined__4(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x i32]* [[TMP0]]) #[[ATTR2]] +// CHECK11-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK11: .omp.deinit: +// CHECK11-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) +// CHECK11-NEXT: br label [[DOTEXIT:%.*]] +// CHECK11: .exit: +// CHECK11-NEXT: ret void +// +// +// CHECK11-LABEL: define {{[^@]+}}@__omp_outlined__4 +// CHECK11-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { +// CHECK11-NEXT: entry: +// CHECK11-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK11-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK11-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 +// CHECK11-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [3 x i8*], align 4 +// CHECK11-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK11-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK11-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 +// CHECK11-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 +// CHECK11-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK11-NEXT: store i32 9, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK11-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK11-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK11-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK11-NEXT: [[TMP1:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK11-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4 +// CHECK11-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) +// CHECK11-NEXT: 
[[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK11-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP3]], 9 +// CHECK11-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK11: cond.true: +// CHECK11-NEXT: br label [[COND_END:%.*]] +// CHECK11: cond.false: +// CHECK11-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK11-NEXT: br label [[COND_END]] +// CHECK11: cond.end: +// CHECK11-NEXT: [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP4]], [[COND_FALSE]] ] +// CHECK11-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK11-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK11-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 +// CHECK11-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK11: omp.inner.for.cond: +// CHECK11-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK11-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP6]], 10 +// CHECK11-NEXT: br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK11: omp.inner.for.body: +// CHECK11-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK11-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK11-NEXT: [[TMP9:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 +// CHECK11-NEXT: [[TMP10:%.*]] = inttoptr i32 [[TMP7]] to i8* +// CHECK11-NEXT: store i8* [[TMP10]], i8** [[TMP9]], align 4 +// CHECK11-NEXT: [[TMP11:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 +// CHECK11-NEXT: [[TMP12:%.*]] = inttoptr i32 [[TMP8]] to i8* +// CHECK11-NEXT: store i8* [[TMP12]], i8** [[TMP11]], align 4 +// CHECK11-NEXT: [[TMP13:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 +// CHECK11-NEXT: [[TMP14:%.*]] = bitcast [10 x i32]* [[TMP0]] to i8* +// CHECK11-NEXT: store i8* [[TMP14]], i8** [[TMP13]], align 4 +// CHECK11-NEXT: [[TMP15:%.*]] = bitcast [3 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK11-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, [10 x i32]*)* @__omp_outlined__5 to i8*), i8* null, i8** [[TMP15]], i32 3) +// CHECK11-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK11: omp.inner.for.inc: +// CHECK11-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK11-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK11-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP16]], [[TMP17]] +// CHECK11-NEXT: store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4 +// CHECK11-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK11-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK11-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP18]], [[TMP19]] +// CHECK11-NEXT: store i32 [[ADD2]], i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK11-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK11-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK11-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP20]], [[TMP21]] +// CHECK11-NEXT: store i32 [[ADD3]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK11-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK11-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP22]], 9 +// CHECK11-NEXT: br i1 [[CMP4]], label [[COND_TRUE5:%.*]], label [[COND_FALSE6:%.*]] +// CHECK11: cond.true5: +// CHECK11-NEXT: br label [[COND_END7:%.*]] +// CHECK11: cond.false6: +// 
CHECK11-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK11-NEXT: br label [[COND_END7]] +// CHECK11: cond.end7: +// CHECK11-NEXT: [[COND8:%.*]] = phi i32 [ 9, [[COND_TRUE5]] ], [ [[TMP23]], [[COND_FALSE6]] ] +// CHECK11-NEXT: store i32 [[COND8]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK11-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK11-NEXT: store i32 [[TMP24]], i32* [[DOTOMP_IV]], align 4 +// CHECK11-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP15:![0-9]+]] +// CHECK11: omp.inner.for.end: +// CHECK11-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK11: omp.loop.exit: +// CHECK11-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]]) +// CHECK11-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK11-NEXT: [[TMP26:%.*]] = icmp ne i32 [[TMP25]], 0 +// CHECK11-NEXT: br i1 [[TMP26]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] +// CHECK11: .omp.final.then: +// CHECK11-NEXT: store i32 10, i32* [[I]], align 4 +// CHECK11-NEXT: br label [[DOTOMP_FINAL_DONE]] +// CHECK11: .omp.final.done: +// CHECK11-NEXT: ret void +// +// +// CHECK11-LABEL: define {{[^@]+}}@__omp_outlined__5 +// CHECK11-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { +// CHECK11-NEXT: entry: +// CHECK11-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK11-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK11-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 +// CHECK11-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK11-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK11-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK11-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK11-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 +// CHECK11-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 +// CHECK11-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 +// CHECK11-NEXT: store i32 9, i32* [[DOTOMP_UB]], align 4 +// CHECK11-NEXT: [[TMP1:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK11-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK11-NEXT: store i32 [[TMP1]], i32* [[DOTOMP_LB]], align 4 +// CHECK11-NEXT: store i32 [[TMP2]], i32* [[DOTOMP_UB]], align 4 +// CHECK11-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK11-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK11-NEXT: [[TMP3:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK11-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4 +// CHECK11-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP4]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* 
[[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) +// CHECK11-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK11-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 +// CHECK11-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK11: omp.inner.for.cond: +// CHECK11-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK11-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK11-NEXT: [[CMP:%.*]] = icmp ule i32 [[TMP6]], [[TMP7]] +// CHECK11-NEXT: br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK11: omp.inner.for.body: +// CHECK11-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK11-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP8]], 1 +// CHECK11-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK11-NEXT: store i32 [[ADD]], i32* [[I]], align 4 +// CHECK11-NEXT: [[TMP9:%.*]] = load i32, i32* [[I]], align 4 +// CHECK11-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP0]], i32 0, i32 [[TMP9]] +// CHECK11-NEXT: [[TMP10:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +// CHECK11-NEXT: [[ADD1:%.*]] = add nsw i32 [[TMP10]], 1 +// CHECK11-NEXT: store i32 [[ADD1]], i32* [[ARRAYIDX]], align 4 +// CHECK11-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK11: omp.body.continue: +// CHECK11-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK11: omp.inner.for.inc: +// CHECK11-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK11-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK11-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP11]], [[TMP12]] +// CHECK11-NEXT: store i32 [[ADD2]], i32* [[DOTOMP_IV]], align 4 +// CHECK11-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP16:![0-9]+]] +// CHECK11: omp.inner.for.end: +// CHECK11-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK11: omp.loop.exit: +// CHECK11-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP4]]) +// CHECK11-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK11-NEXT: [[TMP14:%.*]] = icmp ne i32 [[TMP13]], 0 +// CHECK11-NEXT: br i1 [[TMP14]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] +// CHECK11: .omp.final.then: +// CHECK11-NEXT: store i32 10, i32* [[I]], align 4 +// CHECK11-NEXT: br label [[DOTOMP_FINAL_DONE]] +// CHECK11: .omp.final.done: +// CHECK11-NEXT: ret void +// +// +// CHECK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l54 +// CHECK11-SAME: ([10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i32 [[F:%.*]]) #[[ATTR0]] { +// CHECK11-NEXT: entry: +// CHECK11-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4 +// CHECK11-NEXT: [[F_ADDR:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[F_CASTED:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK11-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4 +// CHECK11-NEXT: store i32 [[F]], i32* [[F_ADDR]], align 4 +// CHECK11-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4 +// CHECK11-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK11-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) +// CHECK11-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK11: .execute: +// CHECK11-NEXT: 
[[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) +// CHECK11-NEXT: [[TMP2:%.*]] = load i32, i32* [[F_ADDR]], align 4 +// CHECK11-NEXT: store i32 [[TMP2]], i32* [[F_CASTED]], align 4 +// CHECK11-NEXT: [[TMP3:%.*]] = load i32, i32* [[F_CASTED]], align 4 +// CHECK11-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK11-NEXT: call void @__omp_outlined__6(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x [10 x i32]]* [[TMP0]], i32 [[TMP3]]) #[[ATTR2]] +// CHECK11-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK11: .omp.deinit: +// CHECK11-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) +// CHECK11-NEXT: br label [[DOTEXIT:%.*]] +// CHECK11: .exit: +// CHECK11-NEXT: ret void +// +// +// CHECK11-LABEL: define {{[^@]+}}@__omp_outlined__6 +// CHECK11-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i32 [[F:%.*]]) #[[ATTR0]] { +// CHECK11-NEXT: entry: +// CHECK11-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK11-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK11-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4 +// CHECK11-NEXT: [[F_ADDR:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[_TMP1:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[K:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[J:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[F_CASTED:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x i8*], align 4 +// CHECK11-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK11-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK11-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4 +// CHECK11-NEXT: store i32 [[F]], i32* [[F_ADDR]], align 4 +// CHECK11-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4 +// CHECK11-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK11-NEXT: store i32 99, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK11-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK11-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK11-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK11-NEXT: [[TMP1:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK11-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4 +// CHECK11-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) +// CHECK11-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK11-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP3]], 99 +// CHECK11-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK11: cond.true: +// CHECK11-NEXT: br label [[COND_END:%.*]] +// CHECK11: cond.false: +// CHECK11-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK11-NEXT: br label [[COND_END]] +// 
CHECK11: cond.end: +// CHECK11-NEXT: [[COND:%.*]] = phi i32 [ 99, [[COND_TRUE]] ], [ [[TMP4]], [[COND_FALSE]] ] +// CHECK11-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK11-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK11-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 +// CHECK11-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK11: omp.inner.for.cond: +// CHECK11-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK11-NEXT: [[CMP2:%.*]] = icmp slt i32 [[TMP6]], 100 +// CHECK11-NEXT: br i1 [[CMP2]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK11: omp.inner.for.body: +// CHECK11-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK11-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK11-NEXT: [[TMP9:%.*]] = load i32, i32* [[F_ADDR]], align 4 +// CHECK11-NEXT: store i32 [[TMP9]], i32* [[F_CASTED]], align 4 +// CHECK11-NEXT: [[TMP10:%.*]] = load i32, i32* [[F_CASTED]], align 4 +// CHECK11-NEXT: [[TMP11:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 +// CHECK11-NEXT: [[TMP12:%.*]] = inttoptr i32 [[TMP7]] to i8* +// CHECK11-NEXT: store i8* [[TMP12]], i8** [[TMP11]], align 4 +// CHECK11-NEXT: [[TMP13:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 +// CHECK11-NEXT: [[TMP14:%.*]] = inttoptr i32 [[TMP8]] to i8* +// CHECK11-NEXT: store i8* [[TMP14]], i8** [[TMP13]], align 4 +// CHECK11-NEXT: [[TMP15:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 +// CHECK11-NEXT: [[TMP16:%.*]] = bitcast [10 x [10 x i32]]* [[TMP0]] to i8* +// CHECK11-NEXT: store i8* [[TMP16]], i8** [[TMP15]], align 4 +// CHECK11-NEXT: [[TMP17:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3 +// CHECK11-NEXT: [[TMP18:%.*]] = inttoptr i32 [[TMP10]] to i8* +// CHECK11-NEXT: store i8* [[TMP18]], i8** [[TMP17]], align 4 +// CHECK11-NEXT: [[TMP19:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK11-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, [10 x [10 x i32]]*, i32)* @__omp_outlined__7 to i8*), i8* null, i8** [[TMP19]], i32 4) +// CHECK11-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK11: omp.inner.for.inc: +// CHECK11-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK11-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK11-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP20]], [[TMP21]] +// CHECK11-NEXT: store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4 +// CHECK11-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK11-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK11-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP22]], [[TMP23]] +// CHECK11-NEXT: store i32 [[ADD3]], i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK11-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK11-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK11-NEXT: [[ADD4:%.*]] = add nsw i32 [[TMP24]], [[TMP25]] +// CHECK11-NEXT: store i32 [[ADD4]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK11-NEXT: [[TMP26:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK11-NEXT: [[CMP5:%.*]] = icmp sgt i32 [[TMP26]], 99 +// CHECK11-NEXT: br i1 [[CMP5]], label [[COND_TRUE6:%.*]], label [[COND_FALSE7:%.*]] +// CHECK11: cond.true6: +// CHECK11-NEXT: 
br label [[COND_END8:%.*]] +// CHECK11: cond.false7: +// CHECK11-NEXT: [[TMP27:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK11-NEXT: br label [[COND_END8]] +// CHECK11: cond.end8: +// CHECK11-NEXT: [[COND9:%.*]] = phi i32 [ 99, [[COND_TRUE6]] ], [ [[TMP27]], [[COND_FALSE7]] ] +// CHECK11-NEXT: store i32 [[COND9]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK11-NEXT: [[TMP28:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK11-NEXT: store i32 [[TMP28]], i32* [[DOTOMP_IV]], align 4 +// CHECK11-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP17:![0-9]+]] +// CHECK11: omp.inner.for.end: +// CHECK11-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK11: omp.loop.exit: +// CHECK11-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]]) +// CHECK11-NEXT: [[TMP29:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK11-NEXT: [[TMP30:%.*]] = icmp ne i32 [[TMP29]], 0 +// CHECK11-NEXT: br i1 [[TMP30]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] +// CHECK11: .omp.final.then: +// CHECK11-NEXT: store i32 10, i32* [[I]], align 4 +// CHECK11-NEXT: store i32 10, i32* [[J]], align 4 +// CHECK11-NEXT: br label [[DOTOMP_FINAL_DONE]] +// CHECK11: .omp.final.done: +// CHECK11-NEXT: ret void +// +// +// CHECK11-LABEL: define {{[^@]+}}@__omp_outlined__7 +// CHECK11-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i32 [[F:%.*]]) #[[ATTR0]] { +// CHECK11-NEXT: entry: +// CHECK11-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK11-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK11-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4 +// CHECK11-NEXT: [[F_ADDR:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[_TMP1:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[K:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[J:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK11-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK11-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK11-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK11-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4 +// CHECK11-NEXT: store i32 [[F]], i32* [[F_ADDR]], align 4 +// CHECK11-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4 +// CHECK11-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 +// CHECK11-NEXT: store i32 99, i32* [[DOTOMP_UB]], align 4 +// CHECK11-NEXT: [[TMP1:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK11-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK11-NEXT: store i32 [[TMP1]], i32* [[DOTOMP_LB]], align 4 +// CHECK11-NEXT: store i32 [[TMP2]], i32* 
[[DOTOMP_UB]], align 4 +// CHECK11-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK11-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK11-NEXT: [[TMP3:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK11-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4 +// CHECK11-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP4]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) +// CHECK11-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK11-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 +// CHECK11-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK11: omp.inner.for.cond: +// CHECK11-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK11-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK11-NEXT: [[CMP:%.*]] = icmp ule i32 [[TMP6]], [[TMP7]] +// CHECK11-NEXT: br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK11: omp.inner.for.body: +// CHECK11-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK11-NEXT: [[DIV:%.*]] = sdiv i32 [[TMP8]], 10 +// CHECK11-NEXT: [[MUL:%.*]] = mul nsw i32 [[DIV]], 1 +// CHECK11-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK11-NEXT: store i32 [[ADD]], i32* [[I]], align 4 +// CHECK11-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK11-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK11-NEXT: [[DIV2:%.*]] = sdiv i32 [[TMP10]], 10 +// CHECK11-NEXT: [[MUL3:%.*]] = mul nsw i32 [[DIV2]], 10 +// CHECK11-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP9]], [[MUL3]] +// CHECK11-NEXT: [[MUL4:%.*]] = mul nsw i32 [[SUB]], 1 +// CHECK11-NEXT: [[ADD5:%.*]] = add nsw i32 0, [[MUL4]] +// CHECK11-NEXT: store i32 [[ADD5]], i32* [[J]], align 4 +// CHECK11-NEXT: store i32 10, i32* [[K]], align 4 +// CHECK11-NEXT: [[TMP11:%.*]] = load i32, i32* [[I]], align 4 +// CHECK11-NEXT: [[TMP12:%.*]] = load i32, i32* [[J]], align 4 +// CHECK11-NEXT: [[TMP13:%.*]] = load i32, i32* [[F_ADDR]], align 4 +// CHECK11-NEXT: [[MUL6:%.*]] = mul nsw i32 [[TMP12]], [[TMP13]] +// CHECK11-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP11]], [[MUL6]] +// CHECK11-NEXT: [[TMP14:%.*]] = load i32, i32* [[K]], align 4 +// CHECK11-NEXT: [[ADD8:%.*]] = add nsw i32 [[ADD7]], [[TMP14]] +// CHECK11-NEXT: [[TMP15:%.*]] = load i32, i32* [[I]], align 4 +// CHECK11-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* [[TMP0]], i32 0, i32 [[TMP15]] +// CHECK11-NEXT: [[TMP16:%.*]] = load i32, i32* [[J]], align 4 +// CHECK11-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[ARRAYIDX]], i32 0, i32 [[TMP16]] +// CHECK11-NEXT: store i32 [[ADD8]], i32* [[ARRAYIDX9]], align 4 +// CHECK11-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK11: omp.body.continue: +// CHECK11-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK11: omp.inner.for.inc: +// CHECK11-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK11-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK11-NEXT: [[ADD10:%.*]] = add nsw i32 [[TMP17]], [[TMP18]] +// CHECK11-NEXT: store i32 [[ADD10]], i32* [[DOTOMP_IV]], align 4 +// CHECK11-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP18:![0-9]+]] +// CHECK11: omp.inner.for.end: +// CHECK11-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK11: omp.loop.exit: +// CHECK11-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP4]]) 
+// CHECK11-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK11-NEXT: [[TMP20:%.*]] = icmp ne i32 [[TMP19]], 0 +// CHECK11-NEXT: br i1 [[TMP20]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] +// CHECK11: .omp.final.then: +// CHECK11-NEXT: store i32 10, i32* [[I]], align 4 +// CHECK11-NEXT: store i32 10, i32* [[J]], align 4 +// CHECK11-NEXT: br label [[DOTOMP_FINAL_DONE]] +// CHECK11: .omp.final.done: +// CHECK11-NEXT: ret void +// +// +// CHECK12-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l38 +// CHECK12-SAME: (i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32 [[L:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK12-NEXT: entry: +// CHECK12-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4 +// CHECK12-NEXT: [[L_ADDR:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[L_CASTED:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK12-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK12-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4 +// CHECK12-NEXT: store i32 [[L]], i32* [[L_ADDR]], align 4 +// CHECK12-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4 +// CHECK12-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK12-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) +// CHECK12-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK12: .execute: +// CHECK12-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3:[0-9]+]]) +// CHECK12-NEXT: [[TMP2:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK12-NEXT: store i32 [[TMP2]], i32* [[N_CASTED]], align 4 +// CHECK12-NEXT: [[TMP3:%.*]] = load i32, i32* [[N_CASTED]], align 4 +// CHECK12-NEXT: [[TMP4:%.*]] = load i32, i32* [[L_ADDR]], align 4 +// CHECK12-NEXT: store i32 [[TMP4]], i32* [[L_CASTED]], align 4 +// CHECK12-NEXT: [[TMP5:%.*]] = load i32, i32* [[L_CASTED]], align 4 +// CHECK12-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK12-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32 [[TMP3]], [1000 x i32]* [[TMP0]], i32 [[TMP5]]) #[[ATTR2:[0-9]+]] +// CHECK12-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK12: .omp.deinit: +// CHECK12-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) +// CHECK12-NEXT: br label [[DOTEXIT:%.*]] +// CHECK12: .exit: +// CHECK12-NEXT: ret void +// +// +// CHECK12-LABEL: define {{[^@]+}}@__omp_outlined__ +// CHECK12-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32 [[L:%.*]]) #[[ATTR0]] { +// CHECK12-NEXT: entry: +// CHECK12-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK12-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK12-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4 +// CHECK12-NEXT: [[L_ADDR:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = 
alloca i32, align 4 +// CHECK12-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[I4:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[L_CASTED:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [5 x i8*], align 4 +// CHECK12-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK12-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK12-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK12-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4 +// CHECK12-NEXT: store i32 [[L]], i32* [[L_ADDR]], align 4 +// CHECK12-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4 +// CHECK12-NEXT: [[TMP1:%.*]] = call i8* @__kmpc_data_sharing_push_stack(i32 4, i16 1) +// CHECK12-NEXT: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to %struct._globalized_locals_ty* +// CHECK12-NEXT: [[L1:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP2]], i32 0, i32 0 +// CHECK12-NEXT: [[TMP3:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK12-NEXT: store i32 [[TMP3]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK12-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK12-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP4]], 0 +// CHECK12-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK12-NEXT: [[SUB3:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK12-NEXT: store i32 [[SUB3]], i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK12-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK12-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK12-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP5]] +// CHECK12-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK12: omp.precond.then: +// CHECK12-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK12-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK12-NEXT: store i32 [[TMP6]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK12-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK12-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK12-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK12-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4 +// CHECK12-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1:[0-9]+]], i32 [[TMP8]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 128) +// CHECK12-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK12-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK12-NEXT: [[CMP5:%.*]] = icmp sgt i32 [[TMP9]], [[TMP10]] +// CHECK12-NEXT: br i1 [[CMP5]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK12: cond.true: +// CHECK12-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK12-NEXT: br label [[COND_END:%.*]] +// CHECK12: cond.false: +// CHECK12-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK12-NEXT: br label [[COND_END]] +// CHECK12: cond.end: +// CHECK12-NEXT: [[COND:%.*]] = phi i32 [ [[TMP11]], [[COND_TRUE]] ], [ [[TMP12]], [[COND_FALSE]] ] +// 
CHECK12-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK12-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK12-NEXT: store i32 [[TMP13]], i32* [[DOTOMP_IV]], align 4 +// CHECK12-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK12: omp.inner.for.cond: +// CHECK12-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK12-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK12-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP15]], 1 +// CHECK12-NEXT: [[CMP6:%.*]] = icmp slt i32 [[TMP14]], [[ADD]] +// CHECK12-NEXT: br i1 [[CMP6]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK12: omp.inner.for.body: +// CHECK12-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK12-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK12-NEXT: [[TMP18:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK12-NEXT: store i32 [[TMP18]], i32* [[N_CASTED]], align 4 +// CHECK12-NEXT: [[TMP19:%.*]] = load i32, i32* [[N_CASTED]], align 4 +// CHECK12-NEXT: [[TMP20:%.*]] = load i32, i32* [[L_ADDR]], align 4 +// CHECK12-NEXT: store i32 [[TMP20]], i32* [[L_CASTED]], align 4 +// CHECK12-NEXT: [[TMP21:%.*]] = load i32, i32* [[L_CASTED]], align 4 +// CHECK12-NEXT: [[TMP22:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 +// CHECK12-NEXT: [[TMP23:%.*]] = inttoptr i32 [[TMP16]] to i8* +// CHECK12-NEXT: store i8* [[TMP23]], i8** [[TMP22]], align 4 +// CHECK12-NEXT: [[TMP24:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 +// CHECK12-NEXT: [[TMP25:%.*]] = inttoptr i32 [[TMP17]] to i8* +// CHECK12-NEXT: store i8* [[TMP25]], i8** [[TMP24]], align 4 +// CHECK12-NEXT: [[TMP26:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 +// CHECK12-NEXT: [[TMP27:%.*]] = inttoptr i32 [[TMP19]] to i8* +// CHECK12-NEXT: store i8* [[TMP27]], i8** [[TMP26]], align 4 +// CHECK12-NEXT: [[TMP28:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3 +// CHECK12-NEXT: [[TMP29:%.*]] = bitcast [1000 x i32]* [[TMP0]] to i8* +// CHECK12-NEXT: store i8* [[TMP29]], i8** [[TMP28]], align 4 +// CHECK12-NEXT: [[TMP30:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 4 +// CHECK12-NEXT: [[TMP31:%.*]] = inttoptr i32 [[TMP21]] to i8* +// CHECK12-NEXT: store i8* [[TMP31]], i8** [[TMP30]], align 4 +// CHECK12-NEXT: [[TMP32:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK12-NEXT: [[TMP33:%.*]] = load i32, i32* [[TMP32]], align 4 +// CHECK12-NEXT: [[TMP34:%.*]] = bitcast [5 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK12-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP33]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, i32, [1000 x i32]*, i32)* @__omp_outlined__1 to i8*), i8* null, i8** [[TMP34]], i32 5) +// CHECK12-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK12: omp.inner.for.inc: +// CHECK12-NEXT: [[TMP35:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK12-NEXT: [[TMP36:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK12-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP35]], [[TMP36]] +// CHECK12-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_IV]], align 4 +// CHECK12-NEXT: [[TMP37:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK12-NEXT: [[TMP38:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK12-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP37]], 
[[TMP38]] +// CHECK12-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK12-NEXT: [[TMP39:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK12-NEXT: [[TMP40:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK12-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP39]], [[TMP40]] +// CHECK12-NEXT: store i32 [[ADD9]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK12-NEXT: [[TMP41:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK12-NEXT: [[TMP42:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK12-NEXT: [[CMP10:%.*]] = icmp sgt i32 [[TMP41]], [[TMP42]] +// CHECK12-NEXT: br i1 [[CMP10]], label [[COND_TRUE11:%.*]], label [[COND_FALSE12:%.*]] +// CHECK12: cond.true11: +// CHECK12-NEXT: [[TMP43:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK12-NEXT: br label [[COND_END13:%.*]] +// CHECK12: cond.false12: +// CHECK12-NEXT: [[TMP44:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK12-NEXT: br label [[COND_END13]] +// CHECK12: cond.end13: +// CHECK12-NEXT: [[COND14:%.*]] = phi i32 [ [[TMP43]], [[COND_TRUE11]] ], [ [[TMP44]], [[COND_FALSE12]] ] +// CHECK12-NEXT: store i32 [[COND14]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK12-NEXT: [[TMP45:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK12-NEXT: store i32 [[TMP45]], i32* [[DOTOMP_IV]], align 4 +// CHECK12-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP10:![0-9]+]] +// CHECK12: omp.inner.for.end: +// CHECK12-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK12: omp.loop.exit: +// CHECK12-NEXT: [[TMP46:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK12-NEXT: [[TMP47:%.*]] = load i32, i32* [[TMP46]], align 4 +// CHECK12-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP47]]) +// CHECK12-NEXT: [[TMP48:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK12-NEXT: [[TMP49:%.*]] = icmp ne i32 [[TMP48]], 0 +// CHECK12-NEXT: br i1 [[TMP49]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] +// CHECK12: .omp.final.then: +// CHECK12-NEXT: [[TMP50:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK12-NEXT: [[SUB15:%.*]] = sub nsw i32 [[TMP50]], 0 +// CHECK12-NEXT: [[DIV16:%.*]] = sdiv i32 [[SUB15]], 1 +// CHECK12-NEXT: [[MUL:%.*]] = mul nsw i32 [[DIV16]], 1 +// CHECK12-NEXT: [[ADD17:%.*]] = add nsw i32 0, [[MUL]] +// CHECK12-NEXT: store i32 [[ADD17]], i32* [[I4]], align 4 +// CHECK12-NEXT: br label [[DOTOMP_FINAL_DONE]] +// CHECK12: .omp.final.done: +// CHECK12-NEXT: [[TMP51:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK12-NEXT: [[TMP52:%.*]] = icmp ne i32 [[TMP51]], 0 +// CHECK12-NEXT: br i1 [[TMP52]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]] +// CHECK12: .omp.lastprivate.then: +// CHECK12-NEXT: [[TMP53:%.*]] = load i32, i32* [[L_ADDR]], align 4 +// CHECK12-NEXT: store i32 [[TMP53]], i32* [[L_ADDR]], align 4 +// CHECK12-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]] +// CHECK12: .omp.lastprivate.done: +// CHECK12-NEXT: br label [[OMP_PRECOND_END]] +// CHECK12: omp.precond.end: +// CHECK12-NEXT: call void @__kmpc_data_sharing_pop_stack(i8* [[TMP1]]) +// CHECK12-NEXT: ret void +// +// +// CHECK12-LABEL: define {{[^@]+}}@__omp_outlined__1 +// CHECK12-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32 [[L:%.*]]) #[[ATTR0]] { +// CHECK12-NEXT: entry: +// CHECK12-NEXT: 
[[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK12-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK12-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4 +// CHECK12-NEXT: [[L_ADDR:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[I3:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK12-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK12-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK12-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK12-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK12-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4 +// CHECK12-NEXT: store i32 [[L]], i32* [[L_ADDR]], align 4 +// CHECK12-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4 +// CHECK12-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK12-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK12-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK12-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 +// CHECK12-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK12-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK12-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK12-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK12-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK12-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] +// CHECK12-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK12: omp.precond.then: +// CHECK12-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 +// CHECK12-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK12-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_UB]], align 4 +// CHECK12-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK12-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK12-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_LB]], align 4 +// CHECK12-NEXT: store i32 [[TMP6]], i32* [[DOTOMP_UB]], align 4 +// CHECK12-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK12-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK12-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK12-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4 +// CHECK12-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP8]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 32) +// CHECK12-NEXT: br label [[OMP_DISPATCH_COND:%.*]] +// CHECK12: omp.dispatch.cond: +// CHECK12-NEXT: [[TMP9:%.*]] = load i32, i32* 
[[DOTOMP_UB]], align 4 +// CHECK12-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK12-NEXT: [[CMP4:%.*]] = icmp ugt i32 [[TMP9]], [[TMP10]] +// CHECK12-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK12: cond.true: +// CHECK12-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK12-NEXT: br label [[COND_END:%.*]] +// CHECK12: cond.false: +// CHECK12-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK12-NEXT: br label [[COND_END]] +// CHECK12: cond.end: +// CHECK12-NEXT: [[COND:%.*]] = phi i32 [ [[TMP11]], [[COND_TRUE]] ], [ [[TMP12]], [[COND_FALSE]] ] +// CHECK12-NEXT: store i32 [[COND]], i32* [[DOTOMP_UB]], align 4 +// CHECK12-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK12-NEXT: store i32 [[TMP13]], i32* [[DOTOMP_IV]], align 4 +// CHECK12-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK12-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK12-NEXT: [[CMP5:%.*]] = icmp sle i32 [[TMP14]], [[TMP15]] +// CHECK12-NEXT: br i1 [[CMP5]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]] +// CHECK12: omp.dispatch.body: +// CHECK12-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK12: omp.inner.for.cond: +// CHECK12-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK12-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK12-NEXT: [[CMP6:%.*]] = icmp sle i32 [[TMP16]], [[TMP17]] +// CHECK12-NEXT: br i1 [[CMP6]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK12: omp.inner.for.body: +// CHECK12-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK12-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP18]], 1 +// CHECK12-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK12-NEXT: store i32 [[ADD]], i32* [[I3]], align 4 +// CHECK12-NEXT: [[TMP19:%.*]] = load i32, i32* [[I3]], align 4 +// CHECK12-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1000 x i32], [1000 x i32]* [[TMP0]], i32 0, i32 [[TMP19]] +// CHECK12-NEXT: store i32 1, i32* [[ARRAYIDX]], align 4 +// CHECK12-NEXT: [[TMP20:%.*]] = load i32, i32* [[I3]], align 4 +// CHECK12-NEXT: store i32 [[TMP20]], i32* [[L_ADDR]], align 4 +// CHECK12-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK12: omp.body.continue: +// CHECK12-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK12: omp.inner.for.inc: +// CHECK12-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK12-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP21]], 1 +// CHECK12-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_IV]], align 4 +// CHECK12-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP12:![0-9]+]] +// CHECK12: omp.inner.for.end: +// CHECK12-NEXT: br label [[OMP_DISPATCH_INC:%.*]] +// CHECK12: omp.dispatch.inc: +// CHECK12-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK12-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK12-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP22]], [[TMP23]] +// CHECK12-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_LB]], align 4 +// CHECK12-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK12-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK12-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP24]], [[TMP25]] +// CHECK12-NEXT: store i32 [[ADD9]], i32* [[DOTOMP_UB]], align 4 +// CHECK12-NEXT: br label [[OMP_DISPATCH_COND]] +// CHECK12: omp.dispatch.end: +// CHECK12-NEXT: [[TMP26:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// 
CHECK12-NEXT: [[TMP27:%.*]] = load i32, i32* [[TMP26]], align 4 +// CHECK12-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP27]]) +// CHECK12-NEXT: [[TMP28:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK12-NEXT: [[TMP29:%.*]] = icmp ne i32 [[TMP28]], 0 +// CHECK12-NEXT: br i1 [[TMP29]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] +// CHECK12: .omp.final.then: +// CHECK12-NEXT: [[TMP30:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK12-NEXT: [[SUB10:%.*]] = sub nsw i32 [[TMP30]], 0 +// CHECK12-NEXT: [[DIV11:%.*]] = sdiv i32 [[SUB10]], 1 +// CHECK12-NEXT: [[MUL12:%.*]] = mul nsw i32 [[DIV11]], 1 +// CHECK12-NEXT: [[ADD13:%.*]] = add nsw i32 0, [[MUL12]] +// CHECK12-NEXT: store i32 [[ADD13]], i32* [[I3]], align 4 +// CHECK12-NEXT: br label [[DOTOMP_FINAL_DONE]] +// CHECK12: .omp.final.done: +// CHECK12-NEXT: [[TMP31:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK12-NEXT: [[TMP32:%.*]] = icmp ne i32 [[TMP31]], 0 +// CHECK12-NEXT: br i1 [[TMP32]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]] +// CHECK12: .omp.lastprivate.then: +// CHECK12-NEXT: [[TMP33:%.*]] = load i32, i32* [[L_ADDR]], align 4 +// CHECK12-NEXT: store i32 [[TMP33]], i32* [[L_ADDR]], align 4 +// CHECK12-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]] +// CHECK12: .omp.lastprivate.done: +// CHECK12-NEXT: br label [[OMP_PRECOND_END]] +// CHECK12: omp.precond.end: +// CHECK12-NEXT: ret void +// +// +// CHECK12-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l44 +// CHECK12-SAME: (i32 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { +// CHECK12-NEXT: entry: +// CHECK12-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 4 +// CHECK12-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK12-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK12-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 4 +// CHECK12-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 4 +// CHECK12-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK12-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) +// CHECK12-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK12: .execute: +// CHECK12-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) +// CHECK12-NEXT: [[TMP2:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK12-NEXT: store i32 [[TMP2]], i32* [[N_CASTED]], align 4 +// CHECK12-NEXT: [[TMP3:%.*]] = load i32, i32* [[N_CASTED]], align 4 +// CHECK12-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK12-NEXT: call void @__omp_outlined__2(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32 [[TMP3]], [1000 x i16]* [[TMP0]]) #[[ATTR2]] +// CHECK12-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK12: .omp.deinit: +// CHECK12-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) +// CHECK12-NEXT: br label [[DOTEXIT:%.*]] +// CHECK12: .exit: +// CHECK12-NEXT: ret void +// +// +// CHECK12-LABEL: define {{[^@]+}}@__omp_outlined__2 +// CHECK12-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) 
[[AA:%.*]]) #[[ATTR0]] { +// CHECK12-NEXT: entry: +// CHECK12-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK12-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK12-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 4 +// CHECK12-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[I3:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x i8*], align 4 +// CHECK12-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK12-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK12-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK12-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 4 +// CHECK12-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 4 +// CHECK12-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK12-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK12-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK12-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 +// CHECK12-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK12-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK12-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK12-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK12-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK12-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] +// CHECK12-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK12: omp.precond.then: +// CHECK12-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK12-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK12-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK12-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK12-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK12-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK12-NEXT: [[TMP5:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK12-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4 +// CHECK12-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP6]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) +// CHECK12-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK12-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK12-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP7]], [[TMP8]] +// CHECK12-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK12: cond.true: +// CHECK12-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK12-NEXT: br label [[COND_END:%.*]] +// CHECK12: cond.false: +// CHECK12-NEXT: [[TMP10:%.*]] = load i32, i32* 
[[DOTOMP_COMB_UB]], align 4 +// CHECK12-NEXT: br label [[COND_END]] +// CHECK12: cond.end: +// CHECK12-NEXT: [[COND:%.*]] = phi i32 [ [[TMP9]], [[COND_TRUE]] ], [ [[TMP10]], [[COND_FALSE]] ] +// CHECK12-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK12-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK12-NEXT: store i32 [[TMP11]], i32* [[DOTOMP_IV]], align 4 +// CHECK12-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK12: omp.inner.for.cond: +// CHECK12-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK12-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK12-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP13]], 1 +// CHECK12-NEXT: [[CMP5:%.*]] = icmp slt i32 [[TMP12]], [[ADD]] +// CHECK12-NEXT: br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK12: omp.inner.for.body: +// CHECK12-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK12-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK12-NEXT: [[TMP16:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK12-NEXT: store i32 [[TMP16]], i32* [[N_CASTED]], align 4 +// CHECK12-NEXT: [[TMP17:%.*]] = load i32, i32* [[N_CASTED]], align 4 +// CHECK12-NEXT: [[TMP18:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 +// CHECK12-NEXT: [[TMP19:%.*]] = inttoptr i32 [[TMP14]] to i8* +// CHECK12-NEXT: store i8* [[TMP19]], i8** [[TMP18]], align 4 +// CHECK12-NEXT: [[TMP20:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 +// CHECK12-NEXT: [[TMP21:%.*]] = inttoptr i32 [[TMP15]] to i8* +// CHECK12-NEXT: store i8* [[TMP21]], i8** [[TMP20]], align 4 +// CHECK12-NEXT: [[TMP22:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 +// CHECK12-NEXT: [[TMP23:%.*]] = inttoptr i32 [[TMP17]] to i8* +// CHECK12-NEXT: store i8* [[TMP23]], i8** [[TMP22]], align 4 +// CHECK12-NEXT: [[TMP24:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3 +// CHECK12-NEXT: [[TMP25:%.*]] = bitcast [1000 x i16]* [[TMP0]] to i8* +// CHECK12-NEXT: store i8* [[TMP25]], i8** [[TMP24]], align 4 +// CHECK12-NEXT: [[TMP26:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK12-NEXT: [[TMP27:%.*]] = load i32, i32* [[TMP26]], align 4 +// CHECK12-NEXT: [[TMP28:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK12-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP27]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, i32, [1000 x i16]*)* @__omp_outlined__3 to i8*), i8* null, i8** [[TMP28]], i32 4) +// CHECK12-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK12: omp.inner.for.inc: +// CHECK12-NEXT: [[TMP29:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK12-NEXT: [[TMP30:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK12-NEXT: [[ADD6:%.*]] = add nsw i32 [[TMP29]], [[TMP30]] +// CHECK12-NEXT: store i32 [[ADD6]], i32* [[DOTOMP_IV]], align 4 +// CHECK12-NEXT: [[TMP31:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK12-NEXT: [[TMP32:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK12-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP31]], [[TMP32]] +// CHECK12-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK12-NEXT: [[TMP33:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK12-NEXT: [[TMP34:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK12-NEXT: 
[[ADD8:%.*]] = add nsw i32 [[TMP33]], [[TMP34]] +// CHECK12-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK12-NEXT: [[TMP35:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK12-NEXT: [[TMP36:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK12-NEXT: [[CMP9:%.*]] = icmp sgt i32 [[TMP35]], [[TMP36]] +// CHECK12-NEXT: br i1 [[CMP9]], label [[COND_TRUE10:%.*]], label [[COND_FALSE11:%.*]] +// CHECK12: cond.true10: +// CHECK12-NEXT: [[TMP37:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK12-NEXT: br label [[COND_END12:%.*]] +// CHECK12: cond.false11: +// CHECK12-NEXT: [[TMP38:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK12-NEXT: br label [[COND_END12]] +// CHECK12: cond.end12: +// CHECK12-NEXT: [[COND13:%.*]] = phi i32 [ [[TMP37]], [[COND_TRUE10]] ], [ [[TMP38]], [[COND_FALSE11]] ] +// CHECK12-NEXT: store i32 [[COND13]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK12-NEXT: [[TMP39:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK12-NEXT: store i32 [[TMP39]], i32* [[DOTOMP_IV]], align 4 +// CHECK12-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP13:![0-9]+]] +// CHECK12: omp.inner.for.end: +// CHECK12-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK12: omp.loop.exit: +// CHECK12-NEXT: [[TMP40:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK12-NEXT: [[TMP41:%.*]] = load i32, i32* [[TMP40]], align 4 +// CHECK12-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP41]]) +// CHECK12-NEXT: [[TMP42:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK12-NEXT: [[TMP43:%.*]] = icmp ne i32 [[TMP42]], 0 +// CHECK12-NEXT: br i1 [[TMP43]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] +// CHECK12: .omp.final.then: +// CHECK12-NEXT: [[TMP44:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK12-NEXT: [[SUB14:%.*]] = sub nsw i32 [[TMP44]], 0 +// CHECK12-NEXT: [[DIV15:%.*]] = sdiv i32 [[SUB14]], 1 +// CHECK12-NEXT: [[MUL:%.*]] = mul nsw i32 [[DIV15]], 1 +// CHECK12-NEXT: [[ADD16:%.*]] = add nsw i32 0, [[MUL]] +// CHECK12-NEXT: store i32 [[ADD16]], i32* [[I3]], align 4 +// CHECK12-NEXT: br label [[DOTOMP_FINAL_DONE]] +// CHECK12: .omp.final.done: +// CHECK12-NEXT: br label [[OMP_PRECOND_END]] +// CHECK12: omp.precond.end: +// CHECK12-NEXT: ret void +// +// +// CHECK12-LABEL: define {{[^@]+}}@__omp_outlined__3 +// CHECK12-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { +// CHECK12-NEXT: entry: +// CHECK12-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK12-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK12-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 4 +// CHECK12-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: 
[[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[I3:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK12-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK12-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK12-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK12-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK12-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 4 +// CHECK12-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 4 +// CHECK12-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK12-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK12-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK12-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 +// CHECK12-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK12-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK12-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK12-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK12-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK12-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] +// CHECK12-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK12: omp.precond.then: +// CHECK12-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 +// CHECK12-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK12-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_UB]], align 4 +// CHECK12-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK12-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK12-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_LB]], align 4 +// CHECK12-NEXT: store i32 [[TMP6]], i32* [[DOTOMP_UB]], align 4 +// CHECK12-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK12-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK12-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK12-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4 +// CHECK12-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP8]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) +// CHECK12-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK12-NEXT: store i32 [[TMP9]], i32* [[DOTOMP_IV]], align 4 +// CHECK12-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK12: omp.inner.for.cond: +// CHECK12-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK12-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK12-NEXT: [[CMP4:%.*]] = icmp ule i32 [[TMP10]], [[TMP11]] +// CHECK12-NEXT: br i1 [[CMP4]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK12: omp.inner.for.body: +// CHECK12-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK12-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP12]], 1 +// CHECK12-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK12-NEXT: store i32 [[ADD]], i32* [[I3]], align 4 +// CHECK12-NEXT: [[TMP13:%.*]] = load i32, i32* [[I3]], align 4 +// CHECK12-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1000 x i16], [1000 x i16]* [[TMP0]], i32 0, i32 [[TMP13]] +// CHECK12-NEXT: [[TMP14:%.*]] = load i16, i16* [[ARRAYIDX]], align 2 +// CHECK12-NEXT: [[CONV:%.*]] = sext i16 
[[TMP14]] to i32 +// CHECK12-NEXT: [[ADD5:%.*]] = add nsw i32 [[CONV]], 1 +// CHECK12-NEXT: [[CONV6:%.*]] = trunc i32 [[ADD5]] to i16 +// CHECK12-NEXT: store i16 [[CONV6]], i16* [[ARRAYIDX]], align 2 +// CHECK12-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK12: omp.body.continue: +// CHECK12-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK12: omp.inner.for.inc: +// CHECK12-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK12-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK12-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP15]], [[TMP16]] +// CHECK12-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_IV]], align 4 +// CHECK12-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP14:![0-9]+]] +// CHECK12: omp.inner.for.end: +// CHECK12-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK12: omp.loop.exit: +// CHECK12-NEXT: [[TMP17:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK12-NEXT: [[TMP18:%.*]] = load i32, i32* [[TMP17]], align 4 +// CHECK12-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP18]]) +// CHECK12-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK12-NEXT: [[TMP20:%.*]] = icmp ne i32 [[TMP19]], 0 +// CHECK12-NEXT: br i1 [[TMP20]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] +// CHECK12: .omp.final.then: +// CHECK12-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK12-NEXT: [[SUB8:%.*]] = sub nsw i32 [[TMP21]], 0 +// CHECK12-NEXT: [[DIV9:%.*]] = sdiv i32 [[SUB8]], 1 +// CHECK12-NEXT: [[MUL10:%.*]] = mul nsw i32 [[DIV9]], 1 +// CHECK12-NEXT: [[ADD11:%.*]] = add nsw i32 0, [[MUL10]] +// CHECK12-NEXT: store i32 [[ADD11]], i32* [[I3]], align 4 +// CHECK12-NEXT: br label [[DOTOMP_FINAL_DONE]] +// CHECK12: .omp.final.done: +// CHECK12-NEXT: br label [[OMP_PRECOND_END]] +// CHECK12: omp.precond.end: +// CHECK12-NEXT: ret void +// +// +// CHECK12-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l49 +// CHECK12-SAME: ([10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { +// CHECK12-NEXT: entry: +// CHECK12-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 +// CHECK12-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK12-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 +// CHECK12-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 +// CHECK12-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK12-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) +// CHECK12-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK12: .execute: +// CHECK12-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) +// CHECK12-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK12-NEXT: call void @__omp_outlined__4(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x i32]* [[TMP0]]) #[[ATTR2]] +// CHECK12-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK12: .omp.deinit: +// CHECK12-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) +// CHECK12-NEXT: br label [[DOTEXIT:%.*]] +// CHECK12: .exit: +// CHECK12-NEXT: ret void +// +// +// CHECK12-LABEL: define {{[^@]+}}@__omp_outlined__4 +// CHECK12-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { +// 
CHECK12-NEXT: entry: +// CHECK12-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK12-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK12-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 +// CHECK12-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [3 x i8*], align 4 +// CHECK12-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK12-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK12-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 +// CHECK12-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 +// CHECK12-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK12-NEXT: store i32 9, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK12-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK12-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK12-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK12-NEXT: [[TMP1:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK12-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4 +// CHECK12-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) +// CHECK12-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK12-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP3]], 9 +// CHECK12-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK12: cond.true: +// CHECK12-NEXT: br label [[COND_END:%.*]] +// CHECK12: cond.false: +// CHECK12-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK12-NEXT: br label [[COND_END]] +// CHECK12: cond.end: +// CHECK12-NEXT: [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP4]], [[COND_FALSE]] ] +// CHECK12-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK12-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK12-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 +// CHECK12-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK12: omp.inner.for.cond: +// CHECK12-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK12-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP6]], 10 +// CHECK12-NEXT: br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK12: omp.inner.for.body: +// CHECK12-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK12-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK12-NEXT: [[TMP9:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 +// CHECK12-NEXT: [[TMP10:%.*]] = inttoptr i32 [[TMP7]] to i8* +// CHECK12-NEXT: store i8* [[TMP10]], i8** [[TMP9]], align 4 +// CHECK12-NEXT: [[TMP11:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 +// CHECK12-NEXT: [[TMP12:%.*]] = inttoptr i32 [[TMP8]] to i8* +// CHECK12-NEXT: store i8* [[TMP12]], i8** [[TMP11]], align 4 +// CHECK12-NEXT: [[TMP13:%.*]] = 
getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 +// CHECK12-NEXT: [[TMP14:%.*]] = bitcast [10 x i32]* [[TMP0]] to i8* +// CHECK12-NEXT: store i8* [[TMP14]], i8** [[TMP13]], align 4 +// CHECK12-NEXT: [[TMP15:%.*]] = bitcast [3 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK12-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, [10 x i32]*)* @__omp_outlined__5 to i8*), i8* null, i8** [[TMP15]], i32 3) +// CHECK12-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK12: omp.inner.for.inc: +// CHECK12-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK12-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK12-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP16]], [[TMP17]] +// CHECK12-NEXT: store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4 +// CHECK12-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK12-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK12-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP18]], [[TMP19]] +// CHECK12-NEXT: store i32 [[ADD2]], i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK12-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK12-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK12-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP20]], [[TMP21]] +// CHECK12-NEXT: store i32 [[ADD3]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK12-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK12-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP22]], 9 +// CHECK12-NEXT: br i1 [[CMP4]], label [[COND_TRUE5:%.*]], label [[COND_FALSE6:%.*]] +// CHECK12: cond.true5: +// CHECK12-NEXT: br label [[COND_END7:%.*]] +// CHECK12: cond.false6: +// CHECK12-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK12-NEXT: br label [[COND_END7]] +// CHECK12: cond.end7: +// CHECK12-NEXT: [[COND8:%.*]] = phi i32 [ 9, [[COND_TRUE5]] ], [ [[TMP23]], [[COND_FALSE6]] ] +// CHECK12-NEXT: store i32 [[COND8]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK12-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK12-NEXT: store i32 [[TMP24]], i32* [[DOTOMP_IV]], align 4 +// CHECK12-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP15:![0-9]+]] +// CHECK12: omp.inner.for.end: +// CHECK12-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK12: omp.loop.exit: +// CHECK12-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]]) +// CHECK12-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK12-NEXT: [[TMP26:%.*]] = icmp ne i32 [[TMP25]], 0 +// CHECK12-NEXT: br i1 [[TMP26]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] +// CHECK12: .omp.final.then: +// CHECK12-NEXT: store i32 10, i32* [[I]], align 4 +// CHECK12-NEXT: br label [[DOTOMP_FINAL_DONE]] +// CHECK12: .omp.final.done: +// CHECK12-NEXT: ret void +// +// +// CHECK12-LABEL: define {{[^@]+}}@__omp_outlined__5 +// CHECK12-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { +// CHECK12-NEXT: entry: +// CHECK12-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK12-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK12-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 +// 
CHECK12-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 +// CHECK12-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK12-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK12-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK12-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK12-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 +// CHECK12-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 +// CHECK12-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 +// CHECK12-NEXT: store i32 9, i32* [[DOTOMP_UB]], align 4 +// CHECK12-NEXT: [[TMP1:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK12-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK12-NEXT: store i32 [[TMP1]], i32* [[DOTOMP_LB]], align 4 +// CHECK12-NEXT: store i32 [[TMP2]], i32* [[DOTOMP_UB]], align 4 +// CHECK12-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK12-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK12-NEXT: [[TMP3:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK12-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4 +// CHECK12-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP4]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) +// CHECK12-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK12-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 +// CHECK12-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK12: omp.inner.for.cond: +// CHECK12-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK12-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK12-NEXT: [[CMP:%.*]] = icmp ule i32 [[TMP6]], [[TMP7]] +// CHECK12-NEXT: br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK12: omp.inner.for.body: +// CHECK12-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK12-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP8]], 1 +// CHECK12-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK12-NEXT: store i32 [[ADD]], i32* [[I]], align 4 +// CHECK12-NEXT: [[TMP9:%.*]] = load i32, i32* [[I]], align 4 +// CHECK12-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP0]], i32 0, i32 [[TMP9]] +// CHECK12-NEXT: [[TMP10:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +// CHECK12-NEXT: [[ADD1:%.*]] = add nsw i32 [[TMP10]], 1 +// CHECK12-NEXT: store i32 [[ADD1]], i32* [[ARRAYIDX]], align 4 +// CHECK12-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK12: omp.body.continue: +// CHECK12-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK12: omp.inner.for.inc: +// CHECK12-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK12-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK12-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP11]], [[TMP12]] +// CHECK12-NEXT: store i32 [[ADD2]], i32* [[DOTOMP_IV]], align 4 +// CHECK12-NEXT: br label [[OMP_INNER_FOR_COND]], 
!llvm.loop [[LOOP16:![0-9]+]] +// CHECK12: omp.inner.for.end: +// CHECK12-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK12: omp.loop.exit: +// CHECK12-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP4]]) +// CHECK12-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK12-NEXT: [[TMP14:%.*]] = icmp ne i32 [[TMP13]], 0 +// CHECK12-NEXT: br i1 [[TMP14]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] +// CHECK12: .omp.final.then: +// CHECK12-NEXT: store i32 10, i32* [[I]], align 4 +// CHECK12-NEXT: br label [[DOTOMP_FINAL_DONE]] +// CHECK12: .omp.final.done: +// CHECK12-NEXT: ret void +// +// +// CHECK12-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l54 +// CHECK12-SAME: ([10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i32 [[F:%.*]]) #[[ATTR0]] { +// CHECK12-NEXT: entry: +// CHECK12-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4 +// CHECK12-NEXT: [[F_ADDR:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[F_CASTED:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK12-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4 +// CHECK12-NEXT: store i32 [[F]], i32* [[F_ADDR]], align 4 +// CHECK12-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4 +// CHECK12-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK12-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) +// CHECK12-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK12: .execute: +// CHECK12-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) +// CHECK12-NEXT: [[TMP2:%.*]] = load i32, i32* [[F_ADDR]], align 4 +// CHECK12-NEXT: store i32 [[TMP2]], i32* [[F_CASTED]], align 4 +// CHECK12-NEXT: [[TMP3:%.*]] = load i32, i32* [[F_CASTED]], align 4 +// CHECK12-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK12-NEXT: call void @__omp_outlined__6(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x [10 x i32]]* [[TMP0]], i32 [[TMP3]]) #[[ATTR2]] +// CHECK12-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK12: .omp.deinit: +// CHECK12-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) +// CHECK12-NEXT: br label [[DOTEXIT:%.*]] +// CHECK12: .exit: +// CHECK12-NEXT: ret void +// +// +// CHECK12-LABEL: define {{[^@]+}}@__omp_outlined__6 +// CHECK12-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i32 [[F:%.*]]) #[[ATTR0]] { +// CHECK12-NEXT: entry: +// CHECK12-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK12-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK12-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4 +// CHECK12-NEXT: [[F_ADDR:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[_TMP1:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[K:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[I:%.*]] 
= alloca i32, align 4 +// CHECK12-NEXT: [[J:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[F_CASTED:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x i8*], align 4 +// CHECK12-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK12-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK12-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4 +// CHECK12-NEXT: store i32 [[F]], i32* [[F_ADDR]], align 4 +// CHECK12-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4 +// CHECK12-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK12-NEXT: store i32 99, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK12-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK12-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK12-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK12-NEXT: [[TMP1:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK12-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4 +// CHECK12-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) +// CHECK12-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK12-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP3]], 99 +// CHECK12-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK12: cond.true: +// CHECK12-NEXT: br label [[COND_END:%.*]] +// CHECK12: cond.false: +// CHECK12-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK12-NEXT: br label [[COND_END]] +// CHECK12: cond.end: +// CHECK12-NEXT: [[COND:%.*]] = phi i32 [ 99, [[COND_TRUE]] ], [ [[TMP4]], [[COND_FALSE]] ] +// CHECK12-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK12-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK12-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 +// CHECK12-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK12: omp.inner.for.cond: +// CHECK12-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK12-NEXT: [[CMP2:%.*]] = icmp slt i32 [[TMP6]], 100 +// CHECK12-NEXT: br i1 [[CMP2]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK12: omp.inner.for.body: +// CHECK12-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK12-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK12-NEXT: [[TMP9:%.*]] = load i32, i32* [[F_ADDR]], align 4 +// CHECK12-NEXT: store i32 [[TMP9]], i32* [[F_CASTED]], align 4 +// CHECK12-NEXT: [[TMP10:%.*]] = load i32, i32* [[F_CASTED]], align 4 +// CHECK12-NEXT: [[TMP11:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 +// CHECK12-NEXT: [[TMP12:%.*]] = inttoptr i32 [[TMP7]] to i8* +// CHECK12-NEXT: store i8* [[TMP12]], i8** [[TMP11]], align 4 +// CHECK12-NEXT: [[TMP13:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 +// CHECK12-NEXT: [[TMP14:%.*]] = inttoptr i32 [[TMP8]] to i8* +// CHECK12-NEXT: store i8* [[TMP14]], i8** [[TMP13]], align 4 +// CHECK12-NEXT: [[TMP15:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 +// CHECK12-NEXT: [[TMP16:%.*]] = bitcast [10 x [10 x i32]]* [[TMP0]] to i8* +// CHECK12-NEXT: store i8* [[TMP16]], i8** [[TMP15]], 
align 4 +// CHECK12-NEXT: [[TMP17:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3 +// CHECK12-NEXT: [[TMP18:%.*]] = inttoptr i32 [[TMP10]] to i8* +// CHECK12-NEXT: store i8* [[TMP18]], i8** [[TMP17]], align 4 +// CHECK12-NEXT: [[TMP19:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK12-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, [10 x [10 x i32]]*, i32)* @__omp_outlined__7 to i8*), i8* null, i8** [[TMP19]], i32 4) +// CHECK12-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK12: omp.inner.for.inc: +// CHECK12-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK12-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK12-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP20]], [[TMP21]] +// CHECK12-NEXT: store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4 +// CHECK12-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK12-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK12-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP22]], [[TMP23]] +// CHECK12-NEXT: store i32 [[ADD3]], i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK12-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK12-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK12-NEXT: [[ADD4:%.*]] = add nsw i32 [[TMP24]], [[TMP25]] +// CHECK12-NEXT: store i32 [[ADD4]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK12-NEXT: [[TMP26:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK12-NEXT: [[CMP5:%.*]] = icmp sgt i32 [[TMP26]], 99 +// CHECK12-NEXT: br i1 [[CMP5]], label [[COND_TRUE6:%.*]], label [[COND_FALSE7:%.*]] +// CHECK12: cond.true6: +// CHECK12-NEXT: br label [[COND_END8:%.*]] +// CHECK12: cond.false7: +// CHECK12-NEXT: [[TMP27:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK12-NEXT: br label [[COND_END8]] +// CHECK12: cond.end8: +// CHECK12-NEXT: [[COND9:%.*]] = phi i32 [ 99, [[COND_TRUE6]] ], [ [[TMP27]], [[COND_FALSE7]] ] +// CHECK12-NEXT: store i32 [[COND9]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK12-NEXT: [[TMP28:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK12-NEXT: store i32 [[TMP28]], i32* [[DOTOMP_IV]], align 4 +// CHECK12-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP17:![0-9]+]] +// CHECK12: omp.inner.for.end: +// CHECK12-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK12: omp.loop.exit: +// CHECK12-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]]) +// CHECK12-NEXT: [[TMP29:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK12-NEXT: [[TMP30:%.*]] = icmp ne i32 [[TMP29]], 0 +// CHECK12-NEXT: br i1 [[TMP30]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] +// CHECK12: .omp.final.then: +// CHECK12-NEXT: store i32 10, i32* [[I]], align 4 +// CHECK12-NEXT: store i32 10, i32* [[J]], align 4 +// CHECK12-NEXT: br label [[DOTOMP_FINAL_DONE]] +// CHECK12: .omp.final.done: +// CHECK12-NEXT: ret void +// +// +// CHECK12-LABEL: define {{[^@]+}}@__omp_outlined__7 +// CHECK12-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i32 [[F:%.*]]) #[[ATTR0]] { +// CHECK12-NEXT: entry: +// CHECK12-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK12-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK12-NEXT: 
[[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4 +// CHECK12-NEXT: [[F_ADDR:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[_TMP1:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[K:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[J:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK12-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK12-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK12-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK12-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4 +// CHECK12-NEXT: store i32 [[F]], i32* [[F_ADDR]], align 4 +// CHECK12-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4 +// CHECK12-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 +// CHECK12-NEXT: store i32 99, i32* [[DOTOMP_UB]], align 4 +// CHECK12-NEXT: [[TMP1:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK12-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK12-NEXT: store i32 [[TMP1]], i32* [[DOTOMP_LB]], align 4 +// CHECK12-NEXT: store i32 [[TMP2]], i32* [[DOTOMP_UB]], align 4 +// CHECK12-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK12-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK12-NEXT: [[TMP3:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK12-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4 +// CHECK12-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP4]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) +// CHECK12-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK12-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 +// CHECK12-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK12: omp.inner.for.cond: +// CHECK12-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK12-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK12-NEXT: [[CMP:%.*]] = icmp ule i32 [[TMP6]], [[TMP7]] +// CHECK12-NEXT: br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK12: omp.inner.for.body: +// CHECK12-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK12-NEXT: [[DIV:%.*]] = sdiv i32 [[TMP8]], 10 +// CHECK12-NEXT: [[MUL:%.*]] = mul nsw i32 [[DIV]], 1 +// CHECK12-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK12-NEXT: store i32 [[ADD]], i32* [[I]], align 4 +// CHECK12-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK12-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK12-NEXT: [[DIV2:%.*]] = sdiv i32 [[TMP10]], 10 +// CHECK12-NEXT: [[MUL3:%.*]] = mul nsw i32 [[DIV2]], 10 +// CHECK12-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP9]], [[MUL3]] +// CHECK12-NEXT: [[MUL4:%.*]] = mul nsw i32 [[SUB]], 1 +// 
CHECK12-NEXT: [[ADD5:%.*]] = add nsw i32 0, [[MUL4]]
+// CHECK12-NEXT: store i32 [[ADD5]], i32* [[J]], align 4
+// CHECK12-NEXT: store i32 10, i32* [[K]], align 4
+// CHECK12-NEXT: [[TMP11:%.*]] = load i32, i32* [[I]], align 4
+// CHECK12-NEXT: [[TMP12:%.*]] = load i32, i32* [[J]], align 4
+// CHECK12-NEXT: [[TMP13:%.*]] = load i32, i32* [[F_ADDR]], align 4
+// CHECK12-NEXT: [[MUL6:%.*]] = mul nsw i32 [[TMP12]], [[TMP13]]
+// CHECK12-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP11]], [[MUL6]]
+// CHECK12-NEXT: [[TMP14:%.*]] = load i32, i32* [[K]], align 4
+// CHECK12-NEXT: [[ADD8:%.*]] = add nsw i32 [[ADD7]], [[TMP14]]
+// CHECK12-NEXT: [[TMP15:%.*]] = load i32, i32* [[I]], align 4
+// CHECK12-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* [[TMP0]], i32 0, i32 [[TMP15]]
+// CHECK12-NEXT: [[TMP16:%.*]] = load i32, i32* [[J]], align 4
+// CHECK12-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[ARRAYIDX]], i32 0, i32 [[TMP16]]
+// CHECK12-NEXT: store i32 [[ADD8]], i32* [[ARRAYIDX9]], align 4
+// CHECK12-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK12: omp.body.continue:
+// CHECK12-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK12: omp.inner.for.inc:
+// CHECK12-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK12-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK12-NEXT: [[ADD10:%.*]] = add nsw i32 [[TMP17]], [[TMP18]]
+// CHECK12-NEXT: store i32 [[ADD10]], i32* [[DOTOMP_IV]], align 4
+// CHECK12-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP18:![0-9]+]]
+// CHECK12: omp.inner.for.end:
+// CHECK12-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK12: omp.loop.exit:
+// CHECK12-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP4]])
+// CHECK12-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK12-NEXT: [[TMP20:%.*]] = icmp ne i32 [[TMP19]], 0
+// CHECK12-NEXT: br i1 [[TMP20]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]]
+// CHECK12: .omp.final.then:
+// CHECK12-NEXT: store i32 10, i32* [[I]], align 4
+// CHECK12-NEXT: store i32 10, i32* [[J]], align 4
+// CHECK12-NEXT: br label [[DOTOMP_FINAL_DONE]]
+// CHECK12: .omp.final.done:
+// CHECK12-NEXT: ret void
+//
diff --git a/clang/test/OpenMP/nvptx_teams_reduction_codegen.cpp b/clang/test/OpenMP/nvptx_teams_reduction_codegen.cpp
--- a/clang/test/OpenMP/nvptx_teams_reduction_codegen.cpp
+++ b/clang/test/OpenMP/nvptx_teams_reduction_codegen.cpp
@@ -1,38 +1,17 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature --include-generated-funcs --replace-function-regex "__omp_offloading_[0-9a-z]+_[0-9a-z]+"
// Test target codegen - host bc file has to be created first.
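Stepping back from the generated checks above: every __kmpc_parallel_51 call site follows one packing discipline. By-value captures (loop bounds, n_casted, f_casted) are smuggled through pointer slots with inttoptr, aggregates travel by address via bitcast, and the resulting captured_vars_addrs array is handed to the runtime alongside the outlined function. The following is a rough C++ rendering of that discipline with a local serial stub standing in for the real device runtime entry; the stub and helper names (kmpc_parallel_51_stub, outlined_body) are invented for illustration, and the operand meanings are read off the IR call sites, not taken from runtime headers.

#include <cstdint>
#include <iostream>

// Parameter layout mirrors the IR call sites above:
// (loc, gtid, if_expr, num_threads, proc_bind, fn, wrapper_fn, args, nargs).
// This stub simply runs the body inline; the real __kmpc_parallel_51
// dispatches it onto the threads of the team instead.
static void kmpc_parallel_51_stub(void * /*loc*/, int32_t /*gtid*/,
                                  int32_t /*if_expr*/, int32_t /*num_threads*/,
                                  int32_t /*proc_bind*/, void *fn,
                                  void * /*wrapper_fn*/, void **args,
                                  int32_t /*nargs*/) {
  // Casting void* back to a function pointer is only conditionally
  // supported C++, but it is exactly what the i8* bitcast in the IR does.
  auto body = reinterpret_cast<void (*)(void **)>(fn);
  body(args);
}

// Outlined body: unpack the by-value captures from the pointer slots
// (the inverse of inttoptr) and the array from its address slot.
static void outlined_body(void **captured) {
  auto lb = static_cast<int32_t>(reinterpret_cast<intptr_t>(captured[0]));
  auto ub = static_cast<int32_t>(reinterpret_cast<intptr_t>(captured[1]));
  auto *b = static_cast<int32_t *>(captured[2]);
  for (int32_t i = lb; i <= ub; ++i)
    b[i] += 1; // same b[i] += 1 pattern as the distributed loop above
}

int main() {
  int32_t b[10] = {};
  void *captured_vars_addrs[3] = {
      reinterpret_cast<void *>(static_cast<intptr_t>(0)), // lower bound
      reinterpret_cast<void *>(static_cast<intptr_t>(9)), // upper bound
      static_cast<void *>(b)};                            // array by address
  // 1 / -1 / -1 match the if-clause, num_threads and proc_bind operands
  // seen at the call sites (meanings assumed from context).
  kmpc_parallel_51_stub(nullptr, 0, 1, -1, -1,
                        reinterpret_cast<void *>(&outlined_body), nullptr,
                        captured_vars_addrs, 3);
  std::cout << b[0] << " " << b[9] << "\n"; // prints: 1 1
  return 0;
}

Running the sketch prints "1 1", confirming that the packed bounds and the array pointer round-trip through the void* slots the same way the inttoptr/bitcast pairs do in the IR.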
// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc
-// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64 --check-prefix SEQ
-// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64 --check-prefix PAR
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix CHECK1
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix CHECK2
// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm-bc %s -o %t-x86-host.bc
-// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 --check-prefix SEQ
-// RUN: %clang_cc1 -verify -fopenmp -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -fopenmp-cuda-teams-reduction-recs-num=2048 -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 --check-prefix SEQ
-// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 --check-prefix PAR
-// RUN: %clang_cc1 -verify -fopenmp -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -fopenmp-cuda-teams-reduction-recs-num=2048 -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 --check-prefix PAR
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK3
+// RUN: %clang_cc1 -verify -fopenmp -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -fopenmp-cuda-teams-reduction-recs-num=2048 -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK4
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix CHECK5
+// RUN: %clang_cc1 -verify -fopenmp -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -fopenmp-cuda-teams-reduction-recs-num=2048 -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix CHECK6
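The removed CHECK-DAG block just below pins down the per-team reduction record types. Read as C++, the layout those lines describe is roughly the following; this is an illustrative sketch only, the struct and variable names are invented, and N is 1024 by default or 2048 when the RUN line passes -fopenmp-cuda-teams-reduction-recs-num=2048.

#include <cstdint>

constexpr int N = 1024; // the {{1024|2048}} alternative in the checks

struct Team1ReduceTy { double e[N]; };                // [[TEAM1_REDUCE_TY]]
struct Team2ReduceTy { int8_t c[N]; float d[N]; };    // [[TEAM2_REDUCE_TY]]
struct Team3ReduceTy { int32_t a[N]; int16_t b[N]; }; // [[TEAM3_REDUCE_TY]]

// The union wraps just the double record, the largest of the three, so a
// single zero-initialized global -- [[TEAMS_RED_BUFFER]] in the checks --
// has enough storage to back any of the teams reductions.
union TeamsReduceUnionTy { Team1ReduceTy t1; };
TeamsReduceUnionTy teams_red_buffer{};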
// expected-no-diagnostics
#ifndef HEADER
#define HEADER
-// CHECK-DAG: [[TEAM1_REDUCE_TY:%.+]] = type { [{{1024|2048}} x double] }
-// CHECK-DAG: [[TEAM2_REDUCE_TY:%.+]] = type { [{{1024|2048}} x i8], [{{1024|2048}} x float] }
-// CHECK-DAG: [[TEAM3_REDUCE_TY:%.+]] = type { [{{1024|2048}} x i32], [{{1024|2048}} x i16] }
-// CHECK-DAG: [[TEAMS_REDUCE_UNION_TY:%.+]] = type { [[TEAM1_REDUCE_TY]] }
-// SEQ-DAG: [[MAP_TY:%.+]] = type { [128 x i8] }
-
-// SEQ-DAG: [[KERNEL_PTR:@.+]] = internal addrspace(3) global i8* undef
-// SEQ-DAG: [[KERNEL_SHARED1:@.+]] = internal unnamed_addr constant i16 1
-// SEQ-DAG: [[KERNEL_SHARED2:@.+]] = internal unnamed_addr constant i16 1
-// SEQ-DAG: [[KERNEL_SIZE1:@.+]] = internal unnamed_addr constant i{{64|32}} {{16|8}}
-// SEQ-DAG: [[KERNEL_SIZE2:@.+]] = internal unnamed_addr constant i{{64|32}} 16
-
-// Check for the data transfer medium in shared memory to transfer the reduction list to the first warp.
-// CHECK-DAG: [[TRANSFER_STORAGE:@.+]] = weak addrspace([[SHARED_ADDRSPACE:[0-9]+]]) global [32 x i32]
-
-// Check that the execution mode of 2 target regions is set to Non-SPMD and the 3rd is in SPMD.
-// CHECK-DAG: {{@__omp_offloading_.+l44}}_exec_mode = weak constant i8 1
-// CHECK-DAG: {{@__omp_offloading_.+l50}}_exec_mode = weak constant i8 1
-// CHECK-DAG: {{@__omp_offloading_.+l57}}_exec_mode = weak constant i8 0
-
-// CHECK-DAG: [[TEAMS_RED_BUFFER:@.+]] = internal global [[TEAMS_REDUCE_UNION_TY]] zeroinitializer
-
template <typename tx>
tx ftemplate(int n) {
  int a;
@@ -73,1328 +52,8681 @@
  return a;
}
- // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l44}}_worker()
-
- // CHECK: define {{.*}}void [[T1:@__omp_offloading_.+template.+l44]](
- //
- // CHECK: {{call|invoke}} void [[T1]]_worker()
- //
- // CHECK: call void @__kmpc_kernel_init(
- // CHECK: call void @__kmpc_kernel_deinit(
- //
- // CHECK: store double {{[0\.e\+]+}}, double* [[E:%.+]], align
- // CHECK: [[EV:%.+]] = load double, double* [[E]], align
- // CHECK: [[ADD:%.+]] = fadd double [[EV]], 5
- // CHECK: store double [[ADD]], double* [[E]], align
- // CHECK: [[GEP1:%.+]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[RED_LIST:%.+]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
- // CHECK: [[BC:%.+]] = bitcast double* [[E]] to i8*
- // CHECK: store i8* [[BC]], i8** [[GEP1]],
- // CHECK: [[BC_RED_LIST:%.+]] = bitcast [1 x i8*]* [[RED_LIST]] to i8*
- // CHECK: [[BUF:%.+]] = load i8*, i8** @
- // CHECK: [[RET:%.+]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(%struct.ident_t* [[LOC:@.+]], i32 [[GTID:%.+]], i8* [[BUF]], i32 {{1024|2048}}, i8* [[BC_RED_LIST]], void (i8*, i16, i16, i16)* [[SHUFFLE_AND_REDUCE:@.+]], void (i8*, i32)* [[INTER_WARP_COPY:@.+]], void (i8*, i32, i8*)* [[RED_LIST_TO_GLOBAL_COPY:@.+]], void (i8*, i32, i8*)* [[RED_LIST_TO_GLOBAL_RED:@.+]], void (i8*, i32, i8*)* [[GLOBAL_TO_RED_LIST_COPY:@.+]], void (i8*, i32, i8*)* [[GLOBAL_TO_RED_LIST_RED:@.+]])
- // CHECK: [[COND:%.+]] = icmp eq i32 [[RET]], 1
- // CHECK: br i1 [[COND]], label {{%?}}[[IFLABEL:.+]], label {{%?}}[[EXIT:.+]]
- //
- // CHECK: [[IFLABEL]]
- // CHECK: [[E_INV:%.+]] = load double, double* [[E_IN:%.+]], align
- // CHECK: [[EV:%.+]] = load double, double* [[E]], align
- // CHECK: [[ADD:%.+]] = fadd double [[E_INV]], [[EV]]
- // CHECK: store double [[ADD]], double*
[[E_IN]], align - // CHECK: call void @__kmpc_nvptx_end_reduce_nowait(i32 [[GTID]]) - // CHECK: br label %[[EXIT]] - // - // CHECK: [[EXIT]] - - // - // Reduction function - // CHECK: define internal void [[REDUCTION_FUNC:@.+]](i8* %0, i8* %1) - // CHECK: [[VAR_RHS_REF:%.+]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[RED_LIST_RHS:%.+]], i{{32|64}} 0, i{{32|64}} 0 - // CHECK: [[VAR_RHS_VOID:%.+]] = load i8*, i8** [[VAR_RHS_REF]], - // CHECK: [[VAR_RHS:%.+]] = bitcast i8* [[VAR_RHS_VOID]] to double* - // - // CHECK: [[VAR_LHS_REF:%.+]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[RED_LIST_LHS:%.+]], i{{32|64}} 0, i{{32|64}} 0 - // CHECK: [[VAR_LHS_VOID:%.+]] = load i8*, i8** [[VAR_LHS_REF]], - // CHECK: [[VAR_LHS:%.+]] = bitcast i8* [[VAR_LHS_VOID]] to double* - // - // CHECK: [[VAR_LHS_VAL:%.+]] = load double, double* [[VAR_LHS]], - // CHECK: [[VAR_RHS_VAL:%.+]] = load double, double* [[VAR_RHS]], - // CHECK: [[RES:%.+]] = fadd double [[VAR_LHS_VAL]], [[VAR_RHS_VAL]] - // CHECK: store double [[RES]], double* [[VAR_LHS]], - // CHECK: ret void - - // - // Shuffle and reduce function - // CHECK: define internal void [[SHUFFLE_AND_REDUCE]](i8* %0, i16 {{.*}}, i16 {{.*}}, i16 {{.*}}) - // CHECK: [[REMOTE_RED_LIST:%.+]] = alloca [1 x i8*], align - // CHECK: [[REMOTE_ELT:%.+]] = alloca double - // - // CHECK: [[LANEID:%.+]] = load i16, i16* {{.+}}, align - // CHECK: [[LANEOFFSET:%.+]] = load i16, i16* {{.+}}, align - // CHECK: [[ALGVER:%.+]] = load i16, i16* {{.+}}, align - // - // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 0 - // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]], - // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[REMOTE_RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 0 - // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to double* - // - // CHECK: [[ELT_CAST:%.+]] = bitcast double* [[ELT]] to i64* - // CHECK: [[REMOTE_ELT_CAST:%.+]] = bitcast double* [[REMOTE_ELT]] to i64* - // CHECK: [[ELT_VAL:%.+]] = load i64, i64* [[ELT_CAST]], align - // CHECK: [[WS32:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() - // CHECK: [[WS:%.+]] = trunc i32 [[WS32]] to i16 - // CHECK: [[REMOTE_ELT_VAL64:%.+]] = call i64 @__kmpc_shuffle_int64(i64 [[ELT_VAL]], i16 [[LANEOFFSET]], i16 [[WS]]) - // - // CHECK: store i64 [[REMOTE_ELT_VAL64]], i64* [[REMOTE_ELT_CAST]], align - // CHECK: [[REMOTE_ELT_VOID:%.+]] = bitcast double* [[REMOTE_ELT]] to i8* - // CHECK: store i8* [[REMOTE_ELT_VOID]], i8** [[REMOTE_ELT_REF]], align - // - // Condition to reduce - // CHECK: [[CONDALG0:%.+]] = icmp eq i16 [[ALGVER]], 0 - // - // CHECK: [[COND1:%.+]] = icmp eq i16 [[ALGVER]], 1 - // CHECK: [[COND2:%.+]] = icmp ult i16 [[LANEID]], [[LANEOFFSET]] - // CHECK: [[CONDALG1:%.+]] = and i1 [[COND1]], [[COND2]] - // - // CHECK: [[COND3:%.+]] = icmp eq i16 [[ALGVER]], 2 - // CHECK: [[COND4:%.+]] = and i16 [[LANEID]], 1 - // CHECK: [[COND5:%.+]] = icmp eq i16 [[COND4]], 0 - // CHECK: [[COND6:%.+]] = and i1 [[COND3]], [[COND5]] - // CHECK: [[COND7:%.+]] = icmp sgt i16 [[LANEOFFSET]], 0 - // CHECK: [[CONDALG2:%.+]] = and i1 [[COND6]], [[COND7]] - // - // CHECK: [[COND8:%.+]] = or i1 [[CONDALG0]], [[CONDALG1]] - // CHECK: [[SHOULD_REDUCE:%.+]] = or i1 [[COND8]], [[CONDALG2]] - // CHECK: br i1 [[SHOULD_REDUCE]], label {{%?}}[[DO_REDUCE:.+]], label {{%?}}[[REDUCE_ELSE:.+]] - // - // CHECK: [[DO_REDUCE]] - // CHECK: [[RED_LIST1_VOID:%.+]] = bitcast [1 x i8*]* [[RED_LIST]] to i8* - // CHECK: [[RED_LIST2_VOID:%.+]] = bitcast 
[1 x i8*]* [[REMOTE_RED_LIST]] to i8* - // CHECK: call void [[REDUCTION_FUNC]](i8* [[RED_LIST1_VOID]], i8* [[RED_LIST2_VOID]]) - // CHECK: br label {{%?}}[[REDUCE_CONT:.+]] - // - // CHECK: [[REDUCE_ELSE]] - // CHECK: br label {{%?}}[[REDUCE_CONT]] - // - // CHECK: [[REDUCE_CONT]] - // Now check if we should just copy over the remote reduction list - // CHECK: [[COND1:%.+]] = icmp eq i16 [[ALGVER]], 1 - // CHECK: [[COND2:%.+]] = icmp uge i16 [[LANEID]], [[LANEOFFSET]] - // CHECK: [[SHOULD_COPY:%.+]] = and i1 [[COND1]], [[COND2]] - // CHECK: br i1 [[SHOULD_COPY]], label {{%?}}[[DO_COPY:.+]], label {{%?}}[[COPY_ELSE:.+]] - // - // CHECK: [[DO_COPY]] - // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[REMOTE_RED_LIST]], i{{32|64}} 0, i{{32|64}} 0 - // CHECK: [[REMOTE_ELT_VOID:%.+]] = load i8*, i8** [[REMOTE_ELT_REF]], - // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 0 - // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]], - // CHECK: [[REMOTE_ELT:%.+]] = bitcast i8* [[REMOTE_ELT_VOID]] to double* - // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to double* - // CHECK: [[REMOTE_ELT_VAL:%.+]] = load double, double* [[REMOTE_ELT]], align - // CHECK: store double [[REMOTE_ELT_VAL]], double* [[ELT]], align - // CHECK: br label {{%?}}[[COPY_CONT:.+]] - // - // CHECK: [[COPY_ELSE]] - // CHECK: br label {{%?}}[[COPY_CONT]] - // - // CHECK: [[COPY_CONT]] - // CHECK: void - - // - // Inter warp copy function - // CHECK: define internal void [[INTER_WARP_COPY]](i8* %0, i32 %1) - // CHECK-DAG: [[LANEID:%.+]] = and i32 {{.+}}, 31 - // CHECK-DAG: [[WARPID:%.+]] = ashr i32 {{.+}}, 5 - // CHECK-DAG: [[RED_LIST:%.+]] = bitcast i8* {{.+}} to [1 x i8*]* - // CHECK: store i32 0, i32* [[CNT_ADDR:%.+]], - // CHECK: br label - // CHECK: [[CNT:%.+]] = load i32, i32* [[CNT_ADDR]], - // CHECK: [[DONE_COPY:%.+]] = icmp ult i32 [[CNT]], 2 - // CHECK: br i1 [[DONE_COPY]], label - // CHECK: call void @__kmpc_barrier(%struct.ident_t* @ - // CHECK: [[IS_WARP_MASTER:%.+]] = icmp eq i32 [[LANEID]], 0 - // CHECK: br i1 [[IS_WARP_MASTER]], label {{%?}}[[DO_COPY:.+]], label {{%?}}[[COPY_ELSE:.+]] - // - // [[DO_COPY]] - // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 0 - // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]], - // CHECK: [[BASE_ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32* - // CHECK: [[ELT:%.+]] = getelementptr i32, i32* [[BASE_ELT]], i32 [[CNT]] - // - // CHECK: [[MEDIUM_ELT:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[WARPID]] - // CHECK: [[ELT_VAL:%.+]] = load i32, i32* [[ELT]], - // CHECK: store volatile i32 [[ELT_VAL]], i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], - // CHECK: br label {{%?}}[[COPY_CONT:.+]] - // - // CHECK: [[COPY_ELSE]] - // CHECK: br label {{%?}}[[COPY_CONT]] - // - // Barrier after copy to shared memory storage medium. - // CHECK: [[COPY_CONT]] - // CHECK: call void @__kmpc_barrier(%struct.ident_t* @ - // CHECK: [[ACTIVE_WARPS:%.+]] = load i32, i32* - // - // Read into warp 0. 
- // CHECK: [[IS_W0_ACTIVE_THREAD:%.+]] = icmp ult i32 [[TID:%.+]], [[ACTIVE_WARPS]] - // CHECK: br i1 [[IS_W0_ACTIVE_THREAD]], label {{%?}}[[DO_READ:.+]], label {{%?}}[[READ_ELSE:.+]] - // - // CHECK: [[DO_READ]] - // CHECK: [[MEDIUM_ELT:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[TID]] - // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 0 - // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]], - // CHECK: [[ELT_BASE:%.+]] = bitcast i8* [[ELT_VOID]] to i32* - // CHECK: [[ELT:%.+]] = getelementptr i32, i32* [[ELT_BASE]], i32 [[CNT]] - // CHECK: [[MEDIUM_ELT_VAL:%.+]] = load volatile i32, i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], - // CHECK: store i32 [[MEDIUM_ELT_VAL]], i32* [[ELT]], - // CHECK: br label {{%?}}[[READ_CONT:.+]] - // - // CHECK: [[READ_ELSE]] - // CHECK: br label {{%?}}[[READ_CONT]] - // - // CHECK: [[READ_CONT]] - // CHECK: [[NEXT:%.+]] = add nsw i32 [[CNT]], 1 - // CHECK: store i32 [[NEXT]], i32* [[CNT_ADDR]], - // CHECK: br label - // CHECK: ret - - // CHECK: define internal void [[RED_LIST_TO_GLOBAL_COPY]](i8* %0, i32 %1, i8* %2) - // CHECK: [[GLOBAL_PTR:%.+]] = alloca i8*, - // CHECK: [[IDX_PTR:%.+]] = alloca i32, - // CHECK: [[RL_PTR:%.+]] = alloca i8*, - // CHECK: store i8* %{{.+}}, i8** [[GLOBAL_PTR]], - // CHECK: store i32 %{{.+}}, i32* [[IDX_PTR]], - // CHECK: store i8* %{{.+}}, i8** [[RL_PTR]], - // CHECK: [[RL_BC:%.+]] = load i8*, i8** [[RL_PTR]], - // CHECK: [[RL:%.+]] = bitcast i8* [[RL_BC]] to [1 x i8*]* - // CHECK: [[GLOBAL_BC:%.+]] = load i8*, i8** [[GLOBAL_PTR]], - // CHECK: [[GLOBAL:%.+]] = bitcast i8* [[GLOBAL_BC]] to [[TEAM1_REDUCE_TY]]* - // CHECK: [[IDX:%.+]] = load i32, i32* [[IDX_PTR]], - // CHECK: [[RL_RED1_PTR:%.+]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[RL]], i{{[0-9]+}} 0, i{{[0-9]+}} 0 - // CHECK: [[RL_RED1_BC:%.+]] = load i8*, i8** [[RL_RED1_PTR]], - // CHECK: [[RL_RED1:%.+]] = bitcast i8* [[RL_RED1_BC]] to double* - // CHECK: [[GLOBAL_RED1_PTR:%.+]] = getelementptr inbounds [[TEAM1_REDUCE_TY]], [[TEAM1_REDUCE_TY]]* [[GLOBAL]], i{{[0-9]+}} 0, i{{[0-9]+}} 0 - // CHECK: [[GLOBAL_RED1_IDX_PTR:%.+]] = getelementptr inbounds [{{1024|2048}} x double], [{{1024|2048}} x double]* [[GLOBAL_RED1_PTR]], i{{[0-9]+}} 0, i32 [[IDX]] - // CHECK: [[LOC_RED1:%.+]] = load double, double* [[RL_RED1]], - // CHECK: store double [[LOC_RED1]], double* [[GLOBAL_RED1_IDX_PTR]], - // CHECK: ret void - - // CHECK: define internal void [[RED_LIST_TO_GLOBAL_RED]](i8* %0, i32 %1, i8* %2) - // CHECK: [[GLOBAL_PTR:%.+]] = alloca i8*, - // CHECK: [[IDX_PTR:%.+]] = alloca i32, - // CHECK: [[RL_PTR:%.+]] = alloca i8*, - // CHECK: [[LOCAL_RL:%.+]] = alloca [1 x i8*], - // CHECK: store i8* %{{.+}}, i8** [[GLOBAL_PTR]], - // CHECK: store i32 %{{.+}}, i32* [[IDX_PTR]], - // CHECK: store i8* %{{.+}}, i8** [[RL_PTR]], - // CHECK: [[GLOBAL_BC:%.+]] = load i8*, i8** [[GLOBAL_PTR]], - // CHECK: [[GLOBAL:%.+]] = bitcast i8* [[GLOBAL_BC]] to [[TEAM1_REDUCE_TY]]* - // CHECK: [[IDX:%.+]] = load i32, i32* [[IDX_PTR]], - // CHECK: [[LOCAL_RL_RED1_PTR:%.+]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[LOCAL_RL]], i{{[0-9]+}} 0, i{{[0-9]+}} 0 - // CHECK: [[GLOBAL_RED1_PTR:%.+]] = getelementptr inbounds [[TEAM1_REDUCE_TY]], [[TEAM1_REDUCE_TY]]* [[GLOBAL]], i{{[0-9]+}} 0, i{{[0-9]+}} 0 - // CHECK: [[GLOBAL_RED1_IDX_PTR:%.+]] = getelementptr inbounds [{{1024|2048}} x double], [{{1024|2048}} x double]* [[GLOBAL_RED1_PTR]], 
i{{[0-9]+}} 0, i32 [[IDX]] - // CHECK: [[GLOBAL_RED1_IDX_PTR_BC:%.+]] = bitcast double* [[GLOBAL_RED1_IDX_PTR]] to i8* - // CHECK: store i8* [[GLOBAL_RED1_IDX_PTR_BC]], i8** [[LOCAL_RL_RED1_PTR]] - // CHECK: [[LOCAL_RL_BC:%.+]] = bitcast [1 x i8*]* [[LOCAL_RL]] to i8* - // CHECK: [[RL_BC:%.+]] = load i8*, i8** [[RL_PTR]], - // CHECK: call void [[REDUCTION_FUNC]](i8* [[LOCAL_RL_BC]], i8* [[RL_BC]]) - // CHECK: ret void - - // CHECK: define internal void [[GLOBAL_TO_RED_LIST_COPY]](i8* %0, i32 %1, i8* %2) - // CHECK: [[GLOBAL_PTR:%.+]] = alloca i8*, - // CHECK: [[IDX_PTR:%.+]] = alloca i32, - // CHECK: [[RL_PTR:%.+]] = alloca i8*, - // CHECK: store i8* %{{.+}}, i8** [[GLOBAL_PTR]], - // CHECK: store i32 %{{.+}}, i32* [[IDX_PTR]], - // CHECK: store i8* %{{.+}}, i8** [[RL_PTR]], - // CHECK: [[RL_BC:%.+]] = load i8*, i8** [[RL_PTR]], - // CHECK: [[RL:%.+]] = bitcast i8* [[RL_BC]] to [1 x i8*]* - // CHECK: [[GLOBAL_BC:%.+]] = load i8*, i8** [[GLOBAL_PTR]], - // CHECK: [[GLOBAL:%.+]] = bitcast i8* [[GLOBAL_BC]] to [[TEAM1_REDUCE_TY]]* - // CHECK: [[IDX:%.+]] = load i32, i32* [[IDX_PTR]], - // CHECK: [[RL_RED1_PTR:%.+]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[RL]], i{{[0-9]+}} 0, i{{[0-9]+}} 0 - // CHECK: [[RL_RED1_BC:%.+]] = load i8*, i8** [[RL_RED1_PTR]], - // CHECK: [[RL_RED1:%.+]] = bitcast i8* [[RL_RED1_BC]] to double* - // CHECK: [[GLOBAL_RED1_PTR:%.+]] = getelementptr inbounds [[TEAM1_REDUCE_TY]], [[TEAM1_REDUCE_TY]]* [[GLOBAL]], i{{[0-9]+}} 0, i{{[0-9]+}} 0 - // CHECK: [[GLOBAL_RED1_IDX_PTR:%.+]] = getelementptr inbounds [{{1024|2048}} x double], [{{1024|2048}} x double]* [[GLOBAL_RED1_PTR]], i{{[0-9]+}} 0, i32 [[IDX]] - // CHECK: [[GLOBAL_RED1:%.+]] = load double, double* [[GLOBAL_RED1_IDX_PTR]], - // CHECK: store double [[GLOBAL_RED1]], double* [[RL_RED1]], - // CHECK: ret void - - // CHECK: define internal void [[GLOBAL_TO_RED_LIST_RED]](i8* %0, i32 %1, i8* %2) - // CHECK: [[GLOBAL_PTR:%.+]] = alloca i8*, - // CHECK: [[IDX_PTR:%.+]] = alloca i32, - // CHECK: [[RL_PTR:%.+]] = alloca i8*, - // CHECK: [[LOCAL_RL:%.+]] = alloca [1 x i8*], - // CHECK: store i8* %{{.+}}, i8** [[GLOBAL_PTR]], - // CHECK: store i32 %{{.+}}, i32* [[IDX_PTR]], - // CHECK: store i8* %{{.+}}, i8** [[RL_PTR]], - // CHECK: [[GLOBAL_BC:%.+]] = load i8*, i8** [[GLOBAL_PTR]], - // CHECK: [[GLOBAL:%.+]] = bitcast i8* [[GLOBAL_BC]] to [[TEAM1_REDUCE_TY]]* - // CHECK: [[IDX:%.+]] = load i32, i32* [[IDX_PTR]], - // CHECK: [[LOCAL_RL_RED1_PTR:%.+]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[LOCAL_RL]], i{{[0-9]+}} 0, i{{[0-9]+}} 0 - // CHECK: [[GLOBAL_RED1_PTR:%.+]] = getelementptr inbounds [[TEAM1_REDUCE_TY]], [[TEAM1_REDUCE_TY]]* [[GLOBAL]], i{{[0-9]+}} 0, i{{[0-9]+}} 0 - // CHECK: [[GLOBAL_RED1_IDX_PTR:%.+]] = getelementptr inbounds [{{1024|2048}} x double], [{{1024|2048}} x double]* [[GLOBAL_RED1_PTR]], i{{[0-9]+}} 0, i32 [[IDX]] - // CHECK: [[GLOBAL_RED1_IDX_PTR_BC:%.+]] = bitcast double* [[GLOBAL_RED1_IDX_PTR]] to i8* - // CHECK: store i8* [[GLOBAL_RED1_IDX_PTR_BC]], i8** [[LOCAL_RL_RED1_PTR]] - // CHECK: [[LOCAL_RL_BC:%.+]] = bitcast [1 x i8*]* [[LOCAL_RL]] to i8* - // CHECK: [[RL_BC:%.+]] = load i8*, i8** [[RL_PTR]], - // CHECK: call void [[REDUCTION_FUNC]](i8* [[RL_BC]], i8* [[LOCAL_RL_BC]]) - // CHECK: ret void - - // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l50}}_worker() - - // CHECK: define {{.*}}void [[T2:@__omp_offloading_.+template.+l50]]( - // - // CHECK: {{call|invoke}} void [[T2]]_worker() - - // - // CHECK: call void @__kmpc_kernel_init( - // CHECK: call void 
- // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l50}}_worker()
-
- // CHECK: define {{.*}}void [[T2:@__omp_offloading_.+template.+l50]](
- //
- // CHECK: {{call|invoke}} void [[T2]]_worker()
-
- //
- // CHECK: call void @__kmpc_kernel_init(
- // CHECK: call void @__kmpc_kernel_deinit(
- //
- // CHECK: store float {{1\.[0e\+]+}}, float* [[D:%.+]], align
- // CHECK: [[C_VAL:%.+]] = load i8, i8* [[C:%.+]], align
- // CHECK: [[CONV:%.+]] = sext i8 [[C_VAL]] to i32
- // CHECK: [[XOR:%.+]] = xor i32 [[CONV]], 2
- // CHECK: [[TRUNC:%.+]] = trunc i32 [[XOR]] to i8
- // CHECK: store i8 [[TRUNC]], i8* [[C]], align
- // CHECK: [[DV:%.+]] = load float, float* [[D]], align
- // CHECK: [[MUL:%.+]] = fmul float [[DV]], {{[0-9e\.\+]+}}
- // CHECK: store float [[MUL]], float* [[D]], align
- // CHECK: [[GEP1:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[RED_LIST:%.+]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
- // CHECK: store i8* [[C]], i8** [[GEP1]],
- // CHECK: [[GEP2:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[RED_LIST:%.+]], i{{[0-9]+}} 0, i{{[0-9]+}} 1
- // CHECK: [[BC:%.+]] = bitcast float* [[D]] to i8*
- // CHECK: store i8* [[BC]], i8** [[GEP2]],
- // CHECK: [[BC_RED_LIST:%.+]] = bitcast [2 x i8*]* [[RED_LIST]] to i8*
- // CHECK: [[BUF:%.+]] = load i8*, i8** @
- // CHECK: [[RET:%.+]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(%struct.ident_t* [[LOC:@.+]], i32 [[GTID:%.+]], i8* [[BUF]], i32 {{1024|2048}}, i8* [[BC_RED_LIST]], void (i8*, i16, i16, i16)* [[SHUFFLE_AND_REDUCE:@.+]], void (i8*, i32)* [[INTER_WARP_COPY:@.+]], void (i8*, i32, i8*)* [[RED_LIST_TO_GLOBAL_COPY:@.+]], void (i8*, i32, i8*)* [[RED_LIST_TO_GLOBAL_RED:@.+]], void (i8*, i32, i8*)* [[GLOBAL_TO_RED_LIST_COPY:@.+]], void (i8*, i32, i8*)* [[GLOBAL_TO_RED_LIST_RED:@.+]])
- // CHECK: [[COND:%.+]] = icmp eq i32 [[RET]], 1
- // CHECK: br i1 [[COND]], label {{%?}}[[IFLABEL:.+]], label {{%?}}[[EXIT:.+]]
- //
- // CHECK: [[IFLABEL]]
- // CHECK: [[C_INV8:%.+]] = load i8, i8* [[C_IN:%.+]], align
- // CHECK: [[C_INV:%.+]] = sext i8 [[C_INV8]] to i32
- // CHECK: [[CV8:%.+]] = load i8, i8* [[C]], align
- // CHECK: [[CV:%.+]] = sext i8 [[CV8]] to i32
- // CHECK: [[XOR:%.+]] = xor i32 [[C_INV]], [[CV]]
- // CHECK: [[TRUNC:%.+]] = trunc i32 [[XOR]] to i8
- // CHECK: store i8 [[TRUNC]], i8* [[C_IN]], align
- // CHECK: [[D_INV:%.+]] = load float, float* [[D_IN:%.+]], align
- // CHECK: [[DV:%.+]] = load float, float* [[D]], align
- // CHECK: [[MUL:%.+]] = fmul float [[D_INV]], [[DV]]
- // CHECK: store float [[MUL]], float* [[D_IN]], align
- // CHECK: call void @__kmpc_nvptx_end_reduce_nowait(i32 [[GTID]])
- // CHECK: br label %[[EXIT]]
- //
- // CHECK: [[EXIT]]
-
- //
- // Reduction function
- // CHECK: define internal void [[REDUCTION_FUNC:@.+]](i8* %0, i8* %1)
- // CHECK: [[VAR1_RHS_REF:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[RED_LIST_RHS:%.+]], i{{32|64}} 0, i{{32|64}} 0
- // CHECK: [[VAR1_RHS:%.+]] = load i8*, i8** [[VAR1_RHS_REF]],
- //
- // CHECK: [[VAR1_LHS_REF:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[RED_LIST_LHS:%.+]], i{{32|64}} 0, i{{32|64}} 0
- // CHECK: [[VAR1_LHS:%.+]] = load i8*, i8** [[VAR1_LHS_REF]],
- //
- // CHECK: [[VAR2_RHS_REF:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[RED_LIST_RHS]], i{{32|64}} 0, i{{32|64}} 1
- // CHECK: [[VAR2_RHS_VOID:%.+]] = load i8*, i8** [[VAR2_RHS_REF]],
- // CHECK: [[VAR2_RHS:%.+]] = bitcast i8* [[VAR2_RHS_VOID]] to float*
- //
- // CHECK: [[VAR2_LHS_REF:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[RED_LIST_LHS]], i{{32|64}} 0, i{{32|64}} 1
- // CHECK: [[VAR2_LHS_VOID:%.+]] = load i8*, i8** [[VAR2_LHS_REF]],
- // CHECK: [[VAR2_LHS:%.+]] = bitcast i8* [[VAR2_LHS_VOID]] to float*
- //
- // CHECK: [[VAR1_LHS_VAL8:%.+]] = load i8, i8* [[VAR1_LHS]],
- // CHECK: [[VAR1_LHS_VAL:%.+]] = sext i8 [[VAR1_LHS_VAL8]] to i32
- // CHECK: [[VAR1_RHS_VAL8:%.+]] = load i8, i8* [[VAR1_RHS]],
- // CHECK: [[VAR1_RHS_VAL:%.+]] = sext i8 [[VAR1_RHS_VAL8]] to i32
- // CHECK: [[XOR:%.+]] = xor i32 [[VAR1_LHS_VAL]], [[VAR1_RHS_VAL]]
- // CHECK: [[RES:%.+]] = trunc i32 [[XOR]] to i8
- // CHECK: store i8 [[RES]], i8* [[VAR1_LHS]],
- //
- // CHECK: [[VAR2_LHS_VAL:%.+]] = load float, float* [[VAR2_LHS]],
- // CHECK: [[VAR2_RHS_VAL:%.+]] = load float, float* [[VAR2_RHS]],
- // CHECK: [[RES:%.+]] = fmul float [[VAR2_LHS_VAL]], [[VAR2_RHS_VAL]]
- // CHECK: store float [[RES]], float* [[VAR2_LHS]],
- // CHECK: ret void
-
- //
- // Shuffle and reduce function
- // CHECK: define internal void [[SHUFFLE_AND_REDUCE]](i8* %0, i16 {{.*}}, i16 {{.*}}, i16 {{.*}})
- // CHECK: [[REMOTE_RED_LIST:%.+]] = alloca [2 x i8*], align
- // CHECK: [[REMOTE_ELT1:%.+]] = alloca i8
- // CHECK: [[REMOTE_ELT2:%.+]] = alloca float
- //
- // CHECK: [[LANEID:%.+]] = load i16, i16* {{.+}}, align
- // CHECK: [[LANEOFFSET:%.+]] = load i16, i16* {{.+}}, align
- // CHECK: [[ALGVER:%.+]] = load i16, i16* {{.+}}, align
- //
- // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 0
- // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
- // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[REMOTE_RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 0
- // CHECK: [[ELT_VAL:%.+]] = load i8, i8* [[ELT_VOID]], align
- //
- // CHECK: [[ELT_CAST:%.+]] = sext i8 [[ELT_VAL]] to i32
- // CHECK: [[WS32:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
- // CHECK: [[WS:%.+]] = trunc i32 [[WS32]] to i16
- // CHECK: [[REMOTE_ELT1_VAL32:%.+]] = call i32 @__kmpc_shuffle_int32(i32 [[ELT_CAST]], i16 [[LANEOFFSET]], i16 [[WS]])
- // CHECK: [[REMOTE_ELT1_VAL:%.+]] = trunc i32 [[REMOTE_ELT1_VAL32]] to i8
- //
- // CHECK: store i8 [[REMOTE_ELT1_VAL]], i8* [[REMOTE_ELT1]], align
- // CHECK: store i8* [[REMOTE_ELT1]], i8** [[REMOTE_ELT_REF]], align
- //
- // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 1
- // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
- // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[REMOTE_RED_LIST]], i{{32|64}} 0, i{{32|64}} 1
- // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to float*
- //
- // CHECK: [[ELT_CAST:%.+]] = bitcast float* [[ELT]] to i32*
- // CHECK: [[REMOTE_ELT2_CAST:%.+]] = bitcast float* [[REMOTE_ELT2]] to i32*
- // CHECK: [[ELT_VAL:%.+]] = load i32, i32* [[ELT_CAST]], align
- // CHECK: [[WS32:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
- // CHECK: [[WS:%.+]] = trunc i32 [[WS32]] to i16
- // CHECK: [[REMOTE_ELT2_VAL32:%.+]] = call i32 @__kmpc_shuffle_int32(i32 [[ELT_VAL]], i16 [[LANEOFFSET]], i16 [[WS]])
- //
- // CHECK: store i32 [[REMOTE_ELT2_VAL32]], i32* [[REMOTE_ELT2_CAST]], align
- // CHECK: [[REMOTE_ELT2C:%.+]] = bitcast float* [[REMOTE_ELT2]] to i8*
- // CHECK: store i8* [[REMOTE_ELT2C]], i8** [[REMOTE_ELT_REF]], align
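Sub-word shuffles in these checks always widen the element to `i32`, shuffle, then narrow back. A hedged CUDA equivalent of the `i8` case, using `__shfl_down_sync` as a stand-in for `__kmpc_shuffle_int32` (the runtime entry additionally takes the warp size; names here are illustrative):

  __device__ signed char shuffle_i8(signed char v, unsigned delta) {
    int widened = (int)v;                                 // sext i8 -> i32
    int remote = __shfl_down_sync(0xffffffffu, widened, delta);
    return (signed char)remote;                           // trunc i32 -> i8
  }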
- //
- // Condition to reduce
- // CHECK: [[CONDALG0:%.+]] = icmp eq i16 [[ALGVER]], 0
- //
- // CHECK: [[COND1:%.+]] = icmp eq i16 [[ALGVER]], 1
- // CHECK: [[COND2:%.+]] = icmp ult i16 [[LANEID]], [[LANEOFFSET]]
- // CHECK: [[CONDALG1:%.+]] = and i1 [[COND1]], [[COND2]]
- //
- // CHECK: [[COND3:%.+]] = icmp eq i16 [[ALGVER]], 2
- // CHECK: [[COND4:%.+]] = and i16 [[LANEID]], 1
- // CHECK: [[COND5:%.+]] = icmp eq i16 [[COND4]], 0
- // CHECK: [[COND6:%.+]] = and i1 [[COND3]], [[COND5]]
- // CHECK: [[COND7:%.+]] = icmp sgt i16 [[LANEOFFSET]], 0
- // CHECK: [[CONDALG2:%.+]] = and i1 [[COND6]], [[COND7]]
- //
- // CHECK: [[COND8:%.+]] = or i1 [[CONDALG0]], [[CONDALG1]]
- // CHECK: [[SHOULD_REDUCE:%.+]] = or i1 [[COND8]], [[CONDALG2]]
- // CHECK: br i1 [[SHOULD_REDUCE]], label {{%?}}[[DO_REDUCE:.+]], label {{%?}}[[REDUCE_ELSE:.+]]
- //
- // CHECK: [[DO_REDUCE]]
- // CHECK: [[RED_LIST1_VOID:%.+]] = bitcast [2 x i8*]* [[RED_LIST]] to i8*
- // CHECK: [[RED_LIST2_VOID:%.+]] = bitcast [2 x i8*]* [[REMOTE_RED_LIST]] to i8*
- // CHECK: call void [[REDUCTION_FUNC]](i8* [[RED_LIST1_VOID]], i8* [[RED_LIST2_VOID]])
- // CHECK: br label {{%?}}[[REDUCE_CONT:.+]]
- //
- // CHECK: [[REDUCE_ELSE]]
- // CHECK: br label {{%?}}[[REDUCE_CONT]]
- //
- // CHECK: [[REDUCE_CONT]]
- // Now check if we should just copy over the remote reduction list
- // CHECK: [[COND1:%.+]] = icmp eq i16 [[ALGVER]], 1
- // CHECK: [[COND2:%.+]] = icmp uge i16 [[LANEID]], [[LANEOFFSET]]
- // CHECK: [[SHOULD_COPY:%.+]] = and i1 [[COND1]], [[COND2]]
- // CHECK: br i1 [[SHOULD_COPY]], label {{%?}}[[DO_COPY:.+]], label {{%?}}[[COPY_ELSE:.+]]
- //
- // CHECK: [[DO_COPY]]
- // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[REMOTE_RED_LIST]], i{{32|64}} 0, i{{32|64}} 0
- // CHECK: [[REMOTE_ELT_VOID:%.+]] = load i8*, i8** [[REMOTE_ELT_REF]],
- // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 0
- // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
- // CHECK: [[REMOTE_ELT_VAL:%.+]] = load i8, i8* [[REMOTE_ELT_VOID]], align
- // CHECK: store i8 [[REMOTE_ELT_VAL]], i8* [[ELT_VOID]], align
- //
- // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[REMOTE_RED_LIST]], i{{32|64}} 0, i{{32|64}} 1
- // CHECK: [[REMOTE_ELT_VOID:%.+]] = load i8*, i8** [[REMOTE_ELT_REF]],
- // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 1
- // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
- // CHECK: [[REMOTE_ELT:%.+]] = bitcast i8* [[REMOTE_ELT_VOID]] to float*
- // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to float*
- // CHECK: [[REMOTE_ELT_VAL:%.+]] = load float, float* [[REMOTE_ELT]], align
- // CHECK: store float [[REMOTE_ELT_VAL]], float* [[ELT]], align
- // CHECK: br label {{%?}}[[COPY_CONT:.+]]
- //
- // CHECK: [[COPY_ELSE]]
- // CHECK: br label {{%?}}[[COPY_CONT]]
- //
- // CHECK: [[COPY_CONT]]
- // CHECK: void
-
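The SHOULD_REDUCE predicate checked above distinguishes three shuffle-and-reduce algorithm versions: 0 (full warp), 1 (contiguous partial warp, where only lanes below the offset reduce), and 2 (dispersed lanes, where only even lanes with a positive offset reduce). Written out as plain code, mirroring the icmp/and/or sequence directly (parameter names are illustrative):

  __device__ bool should_reduce(unsigned short lane_id,
                                unsigned short lane_offset,
                                unsigned short alg_ver) {
    bool alg0 = (alg_ver == 0);
    bool alg1 = (alg_ver == 1) && (lane_id < lane_offset);
    bool alg2 = (alg_ver == 2) && ((lane_id & 1) == 0) && (lane_offset > 0);
    return alg0 || alg1 || alg2;
  }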
- //
- // Inter warp copy function
- // CHECK: define internal void [[INTER_WARP_COPY]](i8* %0, i32 %1)
- // CHECK-DAG: [[LANEID:%.+]] = and i32 {{.+}}, 31
- // CHECK-DAG: [[WARPID:%.+]] = ashr i32 {{.+}}, 5
- // CHECK-DAG: [[RED_LIST:%.+]] = bitcast i8* {{.+}} to [2 x i8*]*
- // CHECK: call void @__kmpc_barrier(%struct.ident_t* @
- // CHECK: [[IS_WARP_MASTER:%.+]] = icmp eq i32 [[LANEID]], 0
- // CHECK: br i1 [[IS_WARP_MASTER]], label {{%?}}[[DO_COPY:.+]], label {{%?}}[[COPY_ELSE:.+]]
- //
- // [[DO_COPY]]
- // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 0
- // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
- //
- // CHECK: [[MEDIUM_ELT64:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[WARPID]]
- // CHECK: [[MEDIUM_ELT:%.+]] = bitcast i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT64]] to i8 addrspace([[SHARED_ADDRSPACE]])*
- // CHECK: [[ELT_VAL:%.+]] = load i8, i8* [[ELT_VOID]], align
- // CHECK: store volatile i8 [[ELT_VAL]], i8 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
- // CHECK: br label {{%?}}[[COPY_CONT:.+]]
- //
- // CHECK: [[COPY_ELSE]]
- // CHECK: br label {{%?}}[[COPY_CONT]]
- //
- // Barrier after copy to shared memory storage medium.
- // CHECK: [[COPY_CONT]]
- // CHECK: call void @__kmpc_barrier(%struct.ident_t* @
- // CHECK: [[ACTIVE_WARPS:%.+]] = load i32, i32*
- //
- // Read into warp 0.
- // CHECK: [[IS_W0_ACTIVE_THREAD:%.+]] = icmp ult i32 [[TID:%.+]], [[ACTIVE_WARPS]]
- // CHECK: br i1 [[IS_W0_ACTIVE_THREAD]], label {{%?}}[[DO_READ:.+]], label {{%?}}[[READ_ELSE:.+]]
- //
- // CHECK: [[DO_READ]]
- // CHECK: [[MEDIUM_ELT32:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[TID]]
- // CHECK: [[MEDIUM_ELT:%.+]] = bitcast i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT32]] to i8 addrspace([[SHARED_ADDRSPACE]])*
- // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 0
- // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
- // CHECK: [[MEDIUM_ELT_VAL:%.+]] = load volatile i8, i8 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
- // CHECK: store i8 [[MEDIUM_ELT_VAL]], i8* [[ELT_VOID]], align
- // CHECK: br label {{%?}}[[READ_CONT:.+]]
- //
- // CHECK: [[READ_ELSE]]
- // CHECK: br label {{%?}}[[READ_CONT]]
- //
- // CHECK: [[READ_CONT]]
- // CHECK: call void @__kmpc_barrier(%struct.ident_t* @
- // CHECK: [[IS_WARP_MASTER:%.+]] = icmp eq i32 [[LANEID]], 0
- // CHECK: br i1 [[IS_WARP_MASTER]], label {{%?}}[[DO_COPY:.+]], label {{%?}}[[COPY_ELSE:.+]]
- //
- // [[DO_COPY]]
- // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 1
- // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
- // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32*
- //
- // CHECK: [[MEDIUM_ELT:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[WARPID]]
- // CHECK: [[ELT_VAL:%.+]] = load i32, i32* [[ELT]], align
- // CHECK: store volatile i32 [[ELT_VAL]], i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
- // CHECK: br label {{%?}}[[COPY_CONT:.+]]
- //
- // CHECK: [[COPY_ELSE]]
- // CHECK: br label {{%?}}[[COPY_CONT]]
- //
- // Barrier after copy to shared memory storage medium.
- // CHECK: [[COPY_CONT]]
- // CHECK: call void @__kmpc_barrier(%struct.ident_t* @
- // CHECK: [[ACTIVE_WARPS:%.+]] = load i32, i32*
- //
- // Read into warp 0.
- // CHECK: [[IS_W0_ACTIVE_THREAD:%.+]] = icmp ult i32 [[TID:%.+]], [[ACTIVE_WARPS]]
- // CHECK: br i1 [[IS_W0_ACTIVE_THREAD]], label {{%?}}[[DO_READ:.+]], label {{%?}}[[READ_ELSE:.+]]
- //
- // CHECK: [[DO_READ]]
- // CHECK: [[MEDIUM_ELT:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[TID]]
- // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 1
- // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
- // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32*
- // CHECK: [[MEDIUM_ELT_VAL:%.+]] = load volatile i32, i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
- // CHECK: store i32 [[MEDIUM_ELT_VAL]], i32* [[ELT]], align
- // CHECK: br label {{%?}}[[READ_CONT:.+]]
- //
- // CHECK: [[READ_ELSE]]
- // CHECK: br label {{%?}}[[READ_CONT]]
- //
- // CHECK: [[READ_CONT]]
- // CHECK: ret
-
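The inter-warp copy checked above funnels one value per warp through a 32-slot shared array: each warp master publishes its element, a barrier runs, then warp 0 reads one slot per active warp. A minimal CUDA sketch of the same dance (names hypothetical; the real runtime passes the active-warp count in):

  __shared__ volatile int transfer_medium[32];   // one slot per warp

  __device__ void inter_warp_copy(int *elt, int num_active_warps) {
    int tid = threadIdx.x;
    int lane_id = tid & 31;                      // and i32 %tid, 31
    int warp_id = tid >> 5;                      // ashr i32 %tid, 5
    __syncthreads();
    if (lane_id == 0)                            // warp master publishes
      transfer_medium[warp_id] = *elt;
    __syncthreads();
    if (tid < num_active_warps)                  // warp 0 gathers
      *elt = transfer_medium[tid];
  }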
- // CHECK: define internal void [[RED_LIST_TO_GLOBAL_COPY]](i8* %0, i32 %1, i8* %2)
- // CHECK: [[GLOBAL_PTR:%.+]] = alloca i8*,
- // CHECK: [[IDX_PTR:%.+]] = alloca i32,
- // CHECK: [[RL_PTR:%.+]] = alloca i8*,
- // CHECK: store i8* %{{.+}}, i8** [[GLOBAL_PTR]],
- // CHECK: store i32 %{{.+}}, i32* [[IDX_PTR]],
- // CHECK: store i8* %{{.+}}, i8** [[RL_PTR]],
- // CHECK: [[RL_BC:%.+]] = load i8*, i8** [[RL_PTR]],
- // CHECK: [[RL:%.+]] = bitcast i8* [[RL_BC]] to [2 x i8*]*
- // CHECK: [[GLOBAL_BC:%.+]] = load i8*, i8** [[GLOBAL_PTR]],
- // CHECK: [[GLOBAL:%.+]] = bitcast i8* [[GLOBAL_BC]] to [[TEAM2_REDUCE_TY]]*
- // CHECK: [[IDX:%.+]] = load i32, i32* [[IDX_PTR]],
- // CHECK: [[RL_RED1_PTR:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[RL]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
- // CHECK: [[RL_RED1:%.+]] = load i8*, i8** [[RL_RED1_PTR]],
- // CHECK: [[GLOBAL_RED1_PTR:%.+]] = getelementptr inbounds [[TEAM2_REDUCE_TY]], [[TEAM2_REDUCE_TY]]* [[GLOBAL]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
- // CHECK: [[GLOBAL_RED1_IDX_PTR:%.+]] = getelementptr inbounds [{{1024|2048}} x i8], [{{1024|2048}} x i8]* [[GLOBAL_RED1_PTR]], i{{[0-9]+}} 0, i32 [[IDX]]
- // CHECK: [[LOC_RED1:%.+]] = load i8, i8* [[RL_RED1]],
- // CHECK: store i8 [[LOC_RED1]], i8* [[GLOBAL_RED1_IDX_PTR]],
- // CHECK: [[RL_RED1_PTR:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[RL]], i{{[0-9]+}} 0, i{{[0-9]+}} 1
- // CHECK: [[RL_RED1_BC:%.+]] = load i8*, i8** [[RL_RED1_PTR]],
- // CHECK: [[RL_RED1:%.+]] = bitcast i8* [[RL_RED1_BC]] to float*
- // CHECK: [[GLOBAL_RED1_PTR:%.+]] = getelementptr inbounds [[TEAM2_REDUCE_TY]], [[TEAM2_REDUCE_TY]]* [[GLOBAL]], i{{[0-9]+}} 0, i{{[0-9]+}} 1
- // CHECK: [[GLOBAL_RED1_IDX_PTR:%.+]] = getelementptr inbounds [{{1024|2048}} x float], [{{1024|2048}} x float]* [[GLOBAL_RED1_PTR]], i{{[0-9]+}} 0, i32 [[IDX]]
- // CHECK: [[LOC_RED1:%.+]] = load float, float* [[RL_RED1]],
- // CHECK: store float [[LOC_RED1]], float* [[GLOBAL_RED1_IDX_PTR]],
- // CHECK: ret void
-
- // CHECK: define internal void [[RED_LIST_TO_GLOBAL_RED]](i8* %0, i32 %1, i8* %2)
- // CHECK: [[GLOBAL_PTR:%.+]] = alloca i8*,
- // CHECK: [[IDX_PTR:%.+]] = alloca i32,
- // CHECK: [[RL_PTR:%.+]] = alloca i8*,
- // CHECK: [[LOCAL_RL:%.+]] = alloca [2 x i8*],
- // CHECK: store i8* %{{.+}}, i8** [[GLOBAL_PTR]],
- // CHECK: store i32 %{{.+}}, i32* [[IDX_PTR]],
- // CHECK: store i8* %{{.+}}, i8** [[RL_PTR]],
- // CHECK: [[GLOBAL_BC:%.+]] = load i8*, i8** [[GLOBAL_PTR]],
- // CHECK: [[GLOBAL:%.+]] = bitcast i8* [[GLOBAL_BC]] to [[TEAM2_REDUCE_TY]]*
- // CHECK: [[IDX:%.+]] = load i32, i32* [[IDX_PTR]],
- // CHECK: [[LOCAL_RL_RED1_PTR:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[LOCAL_RL]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
- // CHECK: [[GLOBAL_RED1_PTR:%.+]] = getelementptr inbounds [[TEAM2_REDUCE_TY]], [[TEAM2_REDUCE_TY]]* [[GLOBAL]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
- // CHECK: [[GLOBAL_RED1_IDX_PTR:%.+]] = getelementptr inbounds [{{1024|2048}} x i8], [{{1024|2048}} x i8]* [[GLOBAL_RED1_PTR]], i{{[0-9]+}} 0, i32 [[IDX]]
- // CHECK: store i8* [[GLOBAL_RED1_IDX_PTR]], i8** [[LOCAL_RL_RED1_PTR]]
- // CHECK: [[LOCAL_RL_RED1_PTR:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[LOCAL_RL]], i{{[0-9]+}} 0, i{{[0-9]+}} 1
- // CHECK: [[GLOBAL_RED1_PTR:%.+]] = getelementptr inbounds [[TEAM2_REDUCE_TY]], [[TEAM2_REDUCE_TY]]* [[GLOBAL]], i{{[0-9]+}} 0, i{{[0-9]+}} 1
- // CHECK: [[GLOBAL_RED1_IDX_PTR:%.+]] = getelementptr inbounds [{{1024|2048}} x float], [{{1024|2048}} x float]* [[GLOBAL_RED1_PTR]], i{{[0-9]+}} 0, i32 [[IDX]]
- // CHECK: [[GLOBAL_RED1_IDX_PTR_BC:%.+]] = bitcast float* [[GLOBAL_RED1_IDX_PTR]] to i8*
- // CHECK: store i8* [[GLOBAL_RED1_IDX_PTR_BC]], i8** [[LOCAL_RL_RED1_PTR]]
- // CHECK: [[LOCAL_RL_BC:%.+]] = bitcast [2 x i8*]* [[LOCAL_RL]] to i8*
- // CHECK: [[RL_BC:%.+]] = load i8*, i8** [[RL_PTR]],
- // CHECK: call void [[REDUCTION_FUNC]](i8* [[LOCAL_RL_BC]], i8* [[RL_BC]])
- // CHECK: ret void
-
- // CHECK: define internal void [[GLOBAL_TO_RED_LIST_COPY]](i8* %0, i32 %1, i8* %2)
- // CHECK: [[GLOBAL_PTR:%.+]] = alloca i8*,
- // CHECK: [[IDX_PTR:%.+]] = alloca i32,
- // CHECK: [[RL_PTR:%.+]] = alloca i8*,
- // CHECK: store i8* %{{.+}}, i8** [[GLOBAL_PTR]],
- // CHECK: store i32 %{{.+}}, i32* [[IDX_PTR]],
- // CHECK: store i8* %{{.+}}, i8** [[RL_PTR]],
- // CHECK: [[RL_BC:%.+]] = load i8*, i8** [[RL_PTR]],
- // CHECK: [[RL:%.+]] = bitcast i8* [[RL_BC]] to [2 x i8*]*
- // CHECK: [[GLOBAL_BC:%.+]] = load i8*, i8** [[GLOBAL_PTR]],
- // CHECK: [[GLOBAL:%.+]] = bitcast i8* [[GLOBAL_BC]] to [[TEAM2_REDUCE_TY]]*
- // CHECK: [[IDX:%.+]] = load i32, i32* [[IDX_PTR]],
- // CHECK: [[RL_RED1_PTR:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[RL]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
- // CHECK: [[RL_RED1:%.+]] = load i8*, i8** [[RL_RED1_PTR]],
- // CHECK: [[GLOBAL_RED1_PTR:%.+]] = getelementptr inbounds [[TEAM2_REDUCE_TY]], [[TEAM2_REDUCE_TY]]* [[GLOBAL]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
- // CHECK: [[GLOBAL_RED1_IDX_PTR:%.+]] = getelementptr inbounds [{{1024|2048}} x i8], [{{1024|2048}} x i8]* [[GLOBAL_RED1_PTR]], i{{[0-9]+}} 0, i32 [[IDX]]
- // CHECK: [[GLOBAL_RED1:%.+]] = load i8, i8* [[GLOBAL_RED1_IDX_PTR]],
- // CHECK: store i8 [[GLOBAL_RED1]], i8* [[RL_RED1]],
- // CHECK: [[RL_RED1_PTR:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[RL]], i{{[0-9]+}} 0, i{{[0-9]+}} 1
- // CHECK: [[RL_RED1_BC:%.+]] = load i8*, i8** [[RL_RED1_PTR]],
- // CHECK: [[RL_RED1:%.+]] = bitcast i8* [[RL_RED1_BC]] to float*
- // CHECK: [[GLOBAL_RED1_PTR:%.+]] = getelementptr inbounds [[TEAM2_REDUCE_TY]], [[TEAM2_REDUCE_TY]]* [[GLOBAL]], i{{[0-9]+}} 0, i{{[0-9]+}} 1
- // CHECK: [[GLOBAL_RED1_IDX_PTR:%.+]] = getelementptr inbounds [{{1024|2048}} x float], [{{1024|2048}} x float]* [[GLOBAL_RED1_PTR]], i{{[0-9]+}} 0, i32 [[IDX]]
- // CHECK: [[GLOBAL_RED1:%.+]] = load float, float* [[GLOBAL_RED1_IDX_PTR]],
- // CHECK: store float [[GLOBAL_RED1]], float* [[RL_RED1]],
- // CHECK: ret void
-
- // CHECK: define internal void [[GLOBAL_TO_RED_LIST_RED]](i8* %0, i32 %1, i8* %2)
- // CHECK: [[GLOBAL_PTR:%.+]] = alloca i8*,
- // CHECK: [[IDX_PTR:%.+]] = alloca i32,
- // CHECK: [[RL_PTR:%.+]] = alloca i8*,
- // CHECK: [[LOCAL_RL:%.+]] = alloca [2 x i8*],
- // CHECK: store i8* %{{.+}}, i8** [[GLOBAL_PTR]],
- // CHECK: store i32 %{{.+}}, i32* [[IDX_PTR]],
- // CHECK: store i8* %{{.+}}, i8** [[RL_PTR]],
- // CHECK: [[GLOBAL_BC:%.+]] = load i8*, i8** [[GLOBAL_PTR]],
- // CHECK: [[GLOBAL:%.+]] = bitcast i8* [[GLOBAL_BC]] to [[TEAM2_REDUCE_TY]]*
- // CHECK: [[IDX:%.+]] = load i32, i32* [[IDX_PTR]],
- // CHECK: [[LOCAL_RL_RED1_PTR:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[LOCAL_RL]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
- // CHECK: [[GLOBAL_RED1_PTR:%.+]] = getelementptr inbounds [[TEAM2_REDUCE_TY]], [[TEAM2_REDUCE_TY]]* [[GLOBAL]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
- // CHECK: [[GLOBAL_RED1_IDX_PTR:%.+]] = getelementptr inbounds [{{1024|2048}} x i8], [{{1024|2048}} x i8]* [[GLOBAL_RED1_PTR]], i{{[0-9]+}} 0, i32 [[IDX]]
- // CHECK: store i8* [[GLOBAL_RED1_IDX_PTR]], i8** [[LOCAL_RL_RED1_PTR]]
- // CHECK: [[LOCAL_RL_RED1_PTR:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[LOCAL_RL]], i{{[0-9]+}} 0, i{{[0-9]+}} 1
- // CHECK: [[GLOBAL_RED1_PTR:%.+]] = getelementptr inbounds [[TEAM2_REDUCE_TY]], [[TEAM2_REDUCE_TY]]* [[GLOBAL]], i{{[0-9]+}} 0, i{{[0-9]+}} 1
- // CHECK: [[GLOBAL_RED1_IDX_PTR:%.+]] = getelementptr inbounds [{{1024|2048}} x float], [{{1024|2048}} x float]* [[GLOBAL_RED1_PTR]], i{{[0-9]+}} 0, i32 [[IDX]]
- // CHECK: [[GLOBAL_RED1_IDX_PTR_BC:%.+]] = bitcast float* [[GLOBAL_RED1_IDX_PTR]] to i8*
- // CHECK: store i8* [[GLOBAL_RED1_IDX_PTR_BC]], i8** [[LOCAL_RL_RED1_PTR]]
- // CHECK: [[LOCAL_RL_BC:%.+]] = bitcast [2 x i8*]* [[LOCAL_RL]] to i8*
- // CHECK: [[RL_BC:%.+]] = load i8*, i8** [[RL_PTR]],
- // CHECK: call void [[REDUCTION_FUNC]](i8* [[RL_BC]], i8* [[LOCAL_RL_BC]])
- // CHECK: ret void
-
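Note the argument order in the two `*_RED` helpers above: both build a temporary reduction list whose slots alias the global buffer row, then reuse the single generated reduction function, so the operand order alone picks the direction (list into global, or global into list). Roughly, as a sketch where `reduce_fn` stands for the generated `[[REDUCTION_FUNC]]` and the names are hypothetical:

  typedef void (*reduce_fn_t)(char *lhs_list, char *rhs_list);

  __device__ void red_list_to_global(char *global_elt, char *rl,
                                     reduce_fn_t reduce_fn) {
    char *local_rl[1] = {global_elt};            // slot aliases the buffer
    reduce_fn((char *)local_rl, rl);             // accumulate RL into global
  }

  __device__ void global_to_red_list(char *global_elt, char *rl,
                                     reduce_fn_t reduce_fn) {
    char *local_rl[1] = {global_elt};
    reduce_fn(rl, (char *)local_rl);             // accumulate global into RL
  }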
- // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l57}}(
- //
- // CHECK: call void @__kmpc_spmd_kernel_init(
- // CHECK: call void @__kmpc_data_sharing_init_stack_spmd()
- // CHECK: call void @__kmpc_spmd_kernel_deinit_v2(i16 1)
-
- // CHECK-NOT: call void @{{__kmpc_get_team_static_memory|__kmpc_data_sharing_push_stack}}
- // CHECK: store i32 0,
- // CHECK: store i32 0,
- // CHECK: store i32 0, i32* [[A_ADDR:%.+]], align
- // CHECK: store i16 -32768, i16* [[B_ADDR:%.+]], align
- // CHECK: call void [[OUTLINED:@.+]](i32* {{.+}}, i32* {{.+}}, i32* [[A_ADDR]], i16* [[B_ADDR]])
- // CHECK: [[GEP1:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[RED_LIST:%.+]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
- // CHECK: [[BC:%.+]] = bitcast i32* [[A_ADDR]] to i8*
- // CHECK: store i8* [[BC]], i8** [[GEP1]],
- // CHECK: [[GEP2:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[RED_LIST:%.+]], i{{[0-9]+}} 0, i{{[0-9]+}} 1
- // CHECK: [[BC:%.+]] = bitcast i16* [[B_ADDR]] to i8*
- // CHECK: store i8* [[BC]], i8** [[GEP2]],
- // CHECK: [[BC_RED_LIST:%.+]] = bitcast [2 x i8*]* [[RED_LIST]] to i8*
- // CHECK: [[BUF:%.+]] = load i8*, i8** @
- // CHECK: [[RET:%.+]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(%struct.ident_t* [[LOC:@.+]], i32 [[GTID:%.+]], i8* [[BUF]], i32 {{1024|2048}}, i8* [[BC_RED_LIST]], void (i8*, i16, i16, i16)* [[SHUFFLE_AND_REDUCE:@.+]], void (i8*, i32)* [[INTER_WARP_COPY:@.+]], void (i8*, i32, i8*)* [[RED_LIST_TO_GLOBAL_COPY:@.+]], void (i8*, i32, i8*)* [[RED_LIST_TO_GLOBAL_RED:@.+]], void (i8*, i32, i8*)* [[GLOBAL_TO_RED_LIST_COPY:@.+]], void (i8*, i32, i8*)* [[GLOBAL_TO_RED_LIST_RED:@.+]])
- // CHECK: [[COND:%.+]] = icmp eq i32 [[RET]], 1
- // CHECK: br i1 [[COND]], label {{%?}}[[IFLABEL:.+]], label {{%?}}[[EXIT:.+]]
- //
- // CHECK: [[IFLABEL]]
- // CHECK: [[A_INV:%.+]] = load i32, i32* [[A_IN:%.+]], align
- // CHECK: [[AV:%.+]] = load i32, i32* [[A_ADDR]], align
- // CHECK: [[OR:%.+]] = or i32 [[A_INV]], [[AV]]
- // CHECK: store i32 [[OR]], i32* [[A_IN]], align
- // CHECK: [[B_INV16:%.+]] = load i16, i16* [[B_IN:%.+]], align
- // CHECK: [[B_INV:%.+]] = sext i16 [[B_INV16]] to i32
- // CHECK: [[BV16:%.+]] = load i16, i16* [[B_ADDR]], align
- // CHECK: [[BV:%.+]] = sext i16 [[BV16]] to i32
- // CHECK: [[CMP:%.+]] = icmp sgt i32 [[B_INV]], [[BV]]
- // CHECK: br i1 [[CMP]], label {{%?}}[[DO_MAX:.+]], label {{%?}}[[MAX_ELSE:.+]]
- //
- // CHECK: [[DO_MAX]]
- // CHECK: [[MAX1:%.+]] = load i16, i16* [[B_IN]], align
- // CHECK: br label {{%?}}[[MAX_CONT:.+]]
- //
- // CHECK: [[MAX_ELSE]]
- // CHECK: [[MAX2:%.+]] = load i16, i16* [[B_ADDR]], align
- // CHECK: br label {{%?}}[[MAX_CONT]]
- //
- // CHECK: [[MAX_CONT]]
- // CHECK: [[B_MAX:%.+]] = phi i16 [ [[MAX1]], %[[DO_MAX]] ], [ [[MAX2]], %[[MAX_ELSE]] ]
- // CHECK: store i16 [[B_MAX]], i16* [[B_IN]], align
- // CHECK: call void @__kmpc_nvptx_end_reduce_nowait(i32 [[GTID]])
- // CHECK: br label %[[EXIT]]
- //
- // CHECK: [[EXIT]]
-
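The pattern checked around `__kmpc_nvptx_teams_reduce_nowait_v2` is the usual device reduction epilogue: only the thread for which the runtime returns 1 folds the team-local partials into the original variables, then signals completion. For the `|` and `max` operations in this test, that final combine amounts to the following sketch (hypothetical names, not the generated code itself):

  __device__ void final_combine(int *a_orig, short *b_orig,
                                int a_partial, short b_partial) {
    *a_orig |= a_partial;                        // or-reduction for a
    if (*b_orig < b_partial)                     // max-reduction for b
      *b_orig = b_partial;
  }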
- // CHECK: define internal void [[OUTLINED]](i32* noalias %{{.+}}, i32* noalias %{{.+}}, i32* nonnull align {{[0-9]+}} dereferenceable{{.+}}, i16* nonnull align {{[0-9]+}} dereferenceable{{.+}})
- //
- // CHECK: store i32 0, i32* [[A:%.+]], align
- // CHECK: store i16 -32768, i16* [[B:%.+]], align
- // CHECK: [[A_VAL:%.+]] = load i32, i32* [[A:%.+]], align
- // CHECK: [[OR:%.+]] = or i32 [[A_VAL]], 1
- // CHECK: store i32 [[OR]], i32* [[A]], align
- // CHECK: [[BV16:%.+]] = load i16, i16* [[B]], align
- // CHECK: [[BV:%.+]] = sext i16 [[BV16]] to i32
- // CHECK: [[CMP:%.+]] = icmp sgt i32 99, [[BV]]
- // CHECK: br i1 [[CMP]], label {{%?}}[[DO_MAX:.+]], label {{%?}}[[MAX_ELSE:.+]]
- //
- // CHECK: [[DO_MAX]]
- // CHECK: br label {{%?}}[[MAX_CONT:.+]]
- //
- // CHECK: [[MAX_ELSE]]
- // CHECK: [[BV:%.+]] = load i16, i16* [[B]], align
- // CHECK: [[MAX:%.+]] = sext i16 [[BV]] to i32
- // CHECK: br label {{%?}}[[MAX_CONT]]
- //
- // CHECK: [[MAX_CONT]]
- // CHECK: [[B_LVALUE:%.+]] = phi i32 [ 99, %[[DO_MAX]] ], [ [[MAX]], %[[MAX_ELSE]] ]
- // CHECK: [[TRUNC:%.+]] = trunc i32 [[B_LVALUE]] to i16
- // CHECK: store i16 [[TRUNC]], i16* [[B]], align
- // CHECK: [[PTR1:%.+]] = getelementptr inbounds [[RLT:.+]], [2 x i8*]* [[RL:%.+]], i{{.+}} 0, i[[SZ:.+]] 0
- // CHECK: [[A_CAST:%.+]] = bitcast i32* [[A]] to i8*
- // CHECK: store i8* [[A_CAST]], i8** [[PTR1]], align
- // CHECK: [[PTR2:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RL]], i[[SZ]] 0, i[[SZ]] 1
- // CHECK: [[B_CAST:%.+]] = bitcast i16* [[B]] to i8*
- // CHECK: store i8* [[B_CAST]], i8** [[PTR2]], align
- // CHECK: [[ARG_RL:%.+]] = bitcast [[RLT]]* [[RL]] to i8*
- // CHECK: [[RET:%.+]] = call i32 @__kmpc_nvptx_parallel_reduce_nowait_v2(%struct.ident_t* [[LOC]], i32 {{.+}}, i32 2, i[[SZ]] {{8|16}}, i8* [[ARG_RL]], void (i8*, i16, i16, i16)* [[PAR_SHUFFLE_REDUCE_FN:@.+]], void (i8*, i32)* [[PAR_WARP_COPY_FN:@.+]])
- // CHECK: [[COND:%.+]] = icmp eq i32 [[RET]], 1
- // CHECK: br i1 [[COND]], label {{%?}}[[IFLABEL:.+]], label {{%?}}[[EXIT:.+]]
- //
- // CHECK: [[IFLABEL]]
- // CHECK: [[A_INV:%.+]] = load i32, i32* [[A_IN:%.+]], align
- // CHECK: [[AV:%.+]] = load i32, i32* [[A]], align
- // CHECK: [[OR:%.+]] = or i32 [[A_INV]], [[AV]]
- // CHECK: store i32 [[OR]], i32* [[A_IN]], align
- // CHECK: [[B_INV16:%.+]] = load i16, i16* [[B_IN:%.+]], align
- // CHECK: [[B_INV:%.+]] = sext i16 [[B_INV16]] to i32
- // CHECK: [[BV16:%.+]] = load i16, i16* [[B]], align
- // CHECK: [[BV:%.+]] = sext i16 [[BV16]] to i32
- // CHECK: [[CMP:%.+]] = icmp sgt i32 [[B_INV]], [[BV]]
- // CHECK: br i1 [[CMP]], label {{%?}}[[DO_MAX:.+]], label {{%?}}[[MAX_ELSE:.+]]
- //
- // CHECK: [[DO_MAX]]
- // CHECK: [[MAX1:%.+]] = load i16, i16* [[B_IN]], align
- // CHECK: br label {{%?}}[[MAX_CONT:.+]]
- //
- // CHECK: [[MAX_ELSE]]
- // CHECK: [[MAX2:%.+]] = load i16, i16* [[B]], align
- // CHECK: br label {{%?}}[[MAX_CONT]]
- //
- // CHECK: [[MAX_CONT]]
- // CHECK: [[B_MAX:%.+]] = phi i16 [ [[MAX1]], %[[DO_MAX]] ], [ [[MAX2]], %[[MAX_ELSE]] ]
- // CHECK: store i16 [[B_MAX]], i16* [[B_IN]], align
- // CHECK: call void @__kmpc_nvptx_end_reduce_nowait(
- // CHECK: br label %[[EXIT]]
- //
- // CHECK: [[EXIT]]
- // CHECK: ret void
-
- //
- // Reduction function
- // CHECK: define internal void [[PAR_REDUCTION_FUNC:@.+]](i8* %0, i8* %1)
- // CHECK: [[VAR1_RHS_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST_RHS:%.+]], i[[SZ]] 0, i[[SZ]] 0
- // CHECK: [[VAR1_RHS_VOID:%.+]] = load i8*, i8** [[VAR1_RHS_REF]],
- // CHECK: [[VAR1_RHS:%.+]] = bitcast i8* [[VAR1_RHS_VOID]] to i32*
- //
- // CHECK: [[VAR1_LHS_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST_LHS:%.+]], i[[SZ]] 0, i[[SZ]] 0
- // CHECK: [[VAR1_LHS_VOID:%.+]] = load i8*, i8** [[VAR1_LHS_REF]],
- // CHECK: [[VAR1_LHS:%.+]] = bitcast i8* [[VAR1_LHS_VOID]] to i32*
- //
- // CHECK: [[VAR2_RHS_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST_RHS]], i[[SZ]] 0, i[[SZ]] 1
- // CHECK: [[VAR2_RHS_VOID:%.+]] = load i8*, i8** [[VAR2_RHS_REF]],
- // CHECK: [[VAR2_RHS:%.+]] = bitcast i8* [[VAR2_RHS_VOID]] to i16*
- //
- // CHECK: [[VAR2_LHS_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST_LHS]], i[[SZ]] 0, i[[SZ]] 1
- // CHECK: [[VAR2_LHS_VOID:%.+]] = load i8*, i8** [[VAR2_LHS_REF]],
- // CHECK: [[VAR2_LHS:%.+]] = bitcast i8* [[VAR2_LHS_VOID]] to i16*
- //
- // CHECK: [[VAR1_LHS_VAL:%.+]] = load i32, i32* [[VAR1_LHS]],
- // CHECK: [[VAR1_RHS_VAL:%.+]] = load i32, i32* [[VAR1_RHS]],
- // CHECK: [[OR:%.+]] = or i32 [[VAR1_LHS_VAL]], [[VAR1_RHS_VAL]]
- // CHECK: store i32 [[OR]], i32* [[VAR1_LHS]],
- //
- // CHECK: [[VAR2_LHS_VAL16:%.+]] = load i16, i16* [[VAR2_LHS]],
- // CHECK: [[VAR2_LHS_VAL:%.+]] = sext i16 [[VAR2_LHS_VAL16]] to i32
- // CHECK: [[VAR2_RHS_VAL16:%.+]] = load i16, i16* [[VAR2_RHS]],
- // CHECK: [[VAR2_RHS_VAL:%.+]] = sext i16 [[VAR2_RHS_VAL16]] to i32
- //
- // CHECK: [[CMP:%.+]] = icmp sgt i32 [[VAR2_LHS_VAL]], [[VAR2_RHS_VAL]]
- // CHECK: br i1 [[CMP]], label {{%?}}[[DO_MAX:.+]], label {{%?}}[[MAX_ELSE:.+]]
- //
- // CHECK: [[DO_MAX]]
- // CHECK: [[MAX1:%.+]] = load i16, i16* [[VAR2_LHS]], align
- // CHECK: br label {{%?}}[[MAX_CONT:.+]]
- //
- // CHECK: [[MAX_ELSE]]
- // CHECK: [[MAX2:%.+]] = load i16, i16* [[VAR2_RHS]], align
- // CHECK: br label {{%?}}[[MAX_CONT]]
- //
- // CHECK: [[MAX_CONT]]
- // CHECK: [[MAXV:%.+]] = phi i16 [ [[MAX1]], %[[DO_MAX]] ], [ [[MAX2]], %[[MAX_ELSE]] ]
- // CHECK: store i16 [[MAXV]], i16* [[VAR2_LHS]],
- // CHECK: ret void
- //
- // Shuffle and reduce function
- // CHECK: define internal void [[PAR_SHUFFLE_REDUCE_FN]](i8* %0, i16 {{.*}}, i16 {{.*}}, i16 {{.*}})
- // CHECK: [[REMOTE_RED_LIST:%.+]] = alloca [[RLT]], align
- // CHECK: [[REMOTE_ELT1:%.+]] = alloca i32
- // CHECK: [[REMOTE_ELT2:%.+]] = alloca i16
- //
- // CHECK: [[LANEID:%.+]] = load i16, i16* {{.+}}, align
- // CHECK: [[LANEOFFSET:%.+]] = load i16, i16* {{.+}}, align
- // CHECK: [[ALGVER:%.+]] = load i16, i16* {{.+}}, align
- //
- // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i[[SZ]] 0, i[[SZ]] 0
- // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
- // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST:%.+]], i[[SZ]] 0, i[[SZ]] 0
- // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32*
- // CHECK: [[ELT_VAL:%.+]] = load i32, i32* [[ELT]], align
- //
- // CHECK: [[WS32:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
- // CHECK: [[WS:%.+]] = trunc i32 [[WS32]] to i16
- // CHECK: [[REMOTE_ELT1_VAL:%.+]] = call i32 @__kmpc_shuffle_int32(i32 [[ELT_VAL]], i16 [[LANEOFFSET]], i16 [[WS]])
- //
- // CHECK: store i32 [[REMOTE_ELT1_VAL]], i32* [[REMOTE_ELT1]], align
- // CHECK: [[REMOTE_ELT1C:%.+]] = bitcast i32* [[REMOTE_ELT1]] to i8*
- // CHECK: store i8* [[REMOTE_ELT1C]], i8** [[REMOTE_ELT_REF]], align
- //
- // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i[[SZ]] 0, i[[SZ]] 1
- // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
- // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST]], i[[SZ]] 0, i[[SZ]] 1
- // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i16*
- // CHECK: [[ELT_VAL:%.+]] = load i16, i16* [[ELT]], align
- //
- // CHECK: [[ELT_CAST:%.+]] = sext i16 [[ELT_VAL]] to i32
- // CHECK: [[WS32:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
- // CHECK: [[WS:%.+]] = trunc i32 [[WS32]] to i16
- // CHECK: [[REMOTE_ELT2_VAL32:%.+]] = call i32 @__kmpc_shuffle_int32(i32 [[ELT_CAST]], i16 [[LANEOFFSET]], i16 [[WS]])
- // CHECK: [[REMOTE_ELT2_VAL:%.+]] = trunc i32 [[REMOTE_ELT2_VAL32]] to i16
- //
- // CHECK: store i16 [[REMOTE_ELT2_VAL]], i16* [[REMOTE_ELT2]], align
- // CHECK: [[REMOTE_ELT2C:%.+]] = bitcast i16* [[REMOTE_ELT2]] to i8*
- // CHECK: store i8* [[REMOTE_ELT2C]], i8** [[REMOTE_ELT_REF]], align
- //
- // Condition to reduce
- // CHECK: [[CONDALG0:%.+]] = icmp eq i16 [[ALGVER]], 0
- //
- // CHECK: [[COND1:%.+]] = icmp eq i16 [[ALGVER]], 1
- // CHECK: [[COND2:%.+]] = icmp ult i16 [[LANEID]], [[LANEOFFSET]]
- // CHECK: [[CONDALG1:%.+]] = and i1 [[COND1]], [[COND2]]
- //
- // CHECK: [[COND3:%.+]] = icmp eq i16 [[ALGVER]], 2
- // CHECK: [[COND4:%.+]] = and i16 [[LANEID]], 1
- // CHECK: [[COND5:%.+]] = icmp eq i16 [[COND4]], 0
- // CHECK: [[COND6:%.+]] = and i1 [[COND3]], [[COND5]]
- // CHECK: [[COND7:%.+]] = icmp sgt i16 [[LANEOFFSET]], 0
- // CHECK: [[CONDALG2:%.+]] = and i1 [[COND6]], [[COND7]]
- //
- // CHECK: [[COND8:%.+]] = or i1 [[CONDALG0]], [[CONDALG1]]
- // CHECK: [[SHOULD_REDUCE:%.+]] = or i1 [[COND8]], [[CONDALG2]]
- // CHECK: br i1 [[SHOULD_REDUCE]], label {{%?}}[[DO_REDUCE:.+]], label {{%?}}[[REDUCE_ELSE:.+]]
- //
- // CHECK: [[DO_REDUCE]]
- // CHECK: [[RED_LIST1_VOID:%.+]] = bitcast [[RLT]]* [[RED_LIST]] to i8*
- // CHECK: [[RED_LIST2_VOID:%.+]] = bitcast [[RLT]]* [[REMOTE_RED_LIST]] to i8*
- // CHECK: call void [[PAR_REDUCTION_FUNC]](i8* [[RED_LIST1_VOID]], i8* [[RED_LIST2_VOID]])
- // CHECK: br label {{%?}}[[REDUCE_CONT:.+]]
- //
- // CHECK: [[REDUCE_ELSE]]
- // CHECK: br label {{%?}}[[REDUCE_CONT]]
- //
- // CHECK: [[REDUCE_CONT]]
- // Now check if we should just copy over the remote reduction list
- // CHECK: [[COND1:%.+]] = icmp eq i16 [[ALGVER]], 1
- // CHECK: [[COND2:%.+]] = icmp uge i16 [[LANEID]], [[LANEOFFSET]]
- // CHECK: [[SHOULD_COPY:%.+]] = and i1 [[COND1]], [[COND2]]
- // CHECK: br i1 [[SHOULD_COPY]], label {{%?}}[[DO_COPY:.+]], label {{%?}}[[COPY_ELSE:.+]]
- //
- // CHECK: [[DO_COPY]]
- // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST]], i[[SZ]] 0, i[[SZ]] 0
- // CHECK: [[REMOTE_ELT_VOID:%.+]] = load i8*, i8** [[REMOTE_ELT_REF]],
- // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i[[SZ]] 0, i[[SZ]] 0
- // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
- // CHECK: [[REMOTE_ELT:%.+]] = bitcast i8* [[REMOTE_ELT_VOID]] to i32*
- // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32*
- // CHECK: [[REMOTE_ELT_VAL:%.+]] = load i32, i32* [[REMOTE_ELT]], align
- // CHECK: store i32 [[REMOTE_ELT_VAL]], i32* [[ELT]], align
- //
- // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST]], i[[SZ]] 0, i[[SZ]] 1
- // CHECK: [[REMOTE_ELT_VOID:%.+]] = load i8*, i8** [[REMOTE_ELT_REF]],
- // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i[[SZ]] 0, i[[SZ]] 1
- // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
- // CHECK: [[REMOTE_ELT:%.+]] = bitcast i8* [[REMOTE_ELT_VOID]] to i16*
- // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i16*
- // CHECK: [[REMOTE_ELT_VAL:%.+]] = load i16, i16* [[REMOTE_ELT]], align
- // CHECK: store i16 [[REMOTE_ELT_VAL]], i16* [[ELT]], align
- // CHECK: br label {{%?}}[[COPY_CONT:.+]]
- //
- // CHECK: [[COPY_ELSE]]
- // CHECK: br label {{%?}}[[COPY_CONT]]
- //
- // CHECK: [[COPY_CONT]]
- // CHECK: void
-
- //
- // Inter warp copy function
- // CHECK: define internal void [[PAR_WARP_COPY_FN]](i8* %0, i32 %1)
- // CHECK-DAG: [[LANEID:%.+]] = and i32 {{.+}}, 31
- // CHECK-DAG: [[WARPID:%.+]] = ashr i32 {{.+}}, 5
- // CHECK-DAG: [[RED_LIST:%.+]] = bitcast i8* {{.+}} to [[RLT]]*
- // CHECK: [[IS_WARP_MASTER:%.+]] = icmp eq i32 [[LANEID]], 0
- // CHECK: br i1 [[IS_WARP_MASTER]], label {{%?}}[[DO_COPY:.+]], label {{%?}}[[COPY_ELSE:.+]]
- //
- // [[DO_COPY]]
- // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i[[SZ]] 0, i[[SZ]] 0
- // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
- // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32*
- //
- // CHECK: [[MEDIUM_ELT:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[WARPID]]
- // CHECK: [[ELT_VAL:%.+]] = load i32, i32* [[ELT]], align
- // CHECK: store volatile i32 [[ELT_VAL]], i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
- // CHECK: br label {{%?}}[[COPY_CONT:.+]]
- //
- // CHECK: [[COPY_ELSE]]
- // CHECK: br label {{%?}}[[COPY_CONT]]
- //
- // Barrier after copy to shared memory storage medium.
- // CHECK: [[COPY_CONT]]
- // CHECK: call void @__kmpc_barrier(%struct.ident_t* @
- // CHECK: [[ACTIVE_WARPS:%.+]] = load i32, i32*
- //
- // Read into warp 0.
- // CHECK: [[IS_W0_ACTIVE_THREAD:%.+]] = icmp ult i32 [[TID:%.+]], [[ACTIVE_WARPS]]
- // CHECK: br i1 [[IS_W0_ACTIVE_THREAD]], label {{%?}}[[DO_READ:.+]], label {{%?}}[[READ_ELSE:.+]]
- //
- // CHECK: [[DO_READ]]
- // CHECK: [[MEDIUM_ELT:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[TID]]
- // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i[[SZ]] 0, i[[SZ]] 0
- // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
- // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32*
- // CHECK: [[MEDIUM_ELT_VAL:%.+]] = load volatile i32, i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
- // CHECK: store i32 [[MEDIUM_ELT_VAL]], i32* [[ELT]], align
- // CHECK: br label {{%?}}[[READ_CONT:.+]]
- //
- // CHECK: [[READ_ELSE]]
- // CHECK: br label {{%?}}[[READ_CONT]]
- //
- // CHECK: [[READ_CONT]]
- // CHECK: call void @__kmpc_barrier(%struct.ident_t* @
- // CHECK: [[IS_WARP_MASTER:%.+]] = icmp eq i32 [[LANEID]], 0
- // CHECK: br i1 [[IS_WARP_MASTER]], label {{%?}}[[DO_COPY:.+]], label {{%?}}[[COPY_ELSE:.+]]
- //
- // [[DO_COPY]]
- // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i[[SZ]] 0, i[[SZ]] 1
- // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
- // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i16*
- //
- // CHECK: [[MEDIUM_ELT32:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[WARPID]]
- // CHECK: [[MEDIUM_ELT:%.+]] = bitcast i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT32]] to i16 addrspace([[SHARED_ADDRSPACE]])*
- // CHECK: [[ELT_VAL:%.+]] = load i16, i16* [[ELT]], align
- // CHECK: store volatile i16 [[ELT_VAL]], i16 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
- // CHECK: br label {{%?}}[[COPY_CONT:.+]]
- //
- // CHECK: [[COPY_ELSE]]
- // CHECK: br label {{%?}}[[COPY_CONT]]
- //
- // Barrier after copy to shared memory storage medium.
- // CHECK: [[COPY_CONT]]
- // CHECK: call void @__kmpc_barrier(%struct.ident_t* @
- // CHECK: [[ACTIVE_WARPS:%.+]] = load i32, i32*
- //
- // Read into warp 0.
- // CHECK: [[IS_W0_ACTIVE_THREAD:%.+]] = icmp ult i32 [[TID:%.+]], [[ACTIVE_WARPS]]
- // CHECK: br i1 [[IS_W0_ACTIVE_THREAD]], label {{%?}}[[DO_READ:.+]], label {{%?}}[[READ_ELSE:.+]]
- //
- // CHECK: [[DO_READ]]
- // CHECK: [[MEDIUM_ELT32:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[TID]]
- // CHECK: [[MEDIUM_ELT:%.+]] = bitcast i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT32]] to i16 addrspace([[SHARED_ADDRSPACE]])*
- // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i[[SZ]] 0, i[[SZ]] 1
- // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
- // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i16*
- // CHECK: [[MEDIUM_ELT_VAL:%.+]] = load volatile i16, i16 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
- // CHECK: store i16 [[MEDIUM_ELT_VAL]], i16* [[ELT]], align
- // CHECK: br label {{%?}}[[READ_CONT:.+]]
- //
- // CHECK: [[READ_ELSE]]
- // CHECK: br label {{%?}}[[READ_CONT]]
- //
- // CHECK: [[READ_CONT]]
- // CHECK: ret
-
- //
- // Reduction function
- // CHECK: define internal void [[REDUCTION_FUNC:@.+]](i8* %0, i8* %1)
- // CHECK: [[VAR1_RHS_REF:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[RED_LIST_RHS:%.+]], i[[SZ]] 0, i[[SZ]] 0
- // CHECK: [[VAR1_RHS_VOID:%.+]] = load i8*, i8** [[VAR1_RHS_REF]],
- // CHECK: [[VAR1_RHS:%.+]] = bitcast i8* [[VAR1_RHS_VOID]] to i32*
- //
- // CHECK: [[VAR1_LHS_REF:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[RED_LIST_LHS:%.+]], i[[SZ]] 0, i[[SZ]] 0
- // CHECK: [[VAR1_LHS_VOID:%.+]] = load i8*, i8** [[VAR1_LHS_REF]],
- // CHECK: [[VAR1_LHS:%.+]] = bitcast i8* [[VAR1_LHS_VOID]] to i32*
- //
- // CHECK: [[VAR2_RHS_REF:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[RED_LIST_RHS]], i[[SZ]] 0, i[[SZ]] 1
- // CHECK: [[VAR2_RHS_VOID:%.+]] = load i8*, i8** [[VAR2_RHS_REF]],
- // CHECK: [[VAR2_RHS:%.+]] = bitcast i8* [[VAR2_RHS_VOID]] to i16*
- //
- // CHECK: [[VAR2_LHS_REF:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[RED_LIST_LHS]], i[[SZ]] 0, i[[SZ]] 1
- // CHECK: [[VAR2_LHS_VOID:%.+]] = load i8*, i8** [[VAR2_LHS_REF]],
- // CHECK: [[VAR2_LHS:%.+]] = bitcast i8* [[VAR2_LHS_VOID]] to i16*
- //
- // CHECK: [[VAR1_LHS_VAL:%.+]] = load i32, i32* [[VAR1_LHS]],
- // CHECK: [[VAR1_RHS_VAL:%.+]] = load i32, i32* [[VAR1_RHS]],
- // CHECK: [[OR:%.+]] = or i32 [[VAR1_LHS_VAL]], [[VAR1_RHS_VAL]]
- // CHECK: store i32 [[OR]], i32* [[VAR1_LHS]],
- //
- // CHECK: [[VAR2_LHS_VAL16:%.+]] = load i16, i16* [[VAR2_LHS]],
- // CHECK: [[VAR2_LHS_VAL:%.+]] = sext i16 [[VAR2_LHS_VAL16]] to i32
- // CHECK: [[VAR2_RHS_VAL16:%.+]] = load i16, i16* [[VAR2_RHS]],
- // CHECK: [[VAR2_RHS_VAL:%.+]] = sext i16 [[VAR2_RHS_VAL16]] to i32
- //
- // CHECK: [[CMP:%.+]] = icmp sgt i32 [[VAR2_LHS_VAL]], [[VAR2_RHS_VAL]]
- // CHECK: br i1 [[CMP]], label {{%?}}[[DO_MAX:.+]], label {{%?}}[[MAX_ELSE:.+]]
- //
- // CHECK: [[DO_MAX]]
- // CHECK: [[MAX1:%.+]] = load i16, i16* [[VAR2_LHS]], align
- // CHECK: br label {{%?}}[[MAX_CONT:.+]]
- //
- // CHECK: [[MAX_ELSE]]
- // CHECK: [[MAX2:%.+]] = load i16, i16* [[VAR2_RHS]], align
- // CHECK: br label {{%?}}[[MAX_CONT]]
- //
- // CHECK: [[MAX_CONT]]
- // CHECK: [[MAXV:%.+]] = phi i16 [ [[MAX1]], %[[DO_MAX]] ], [ [[MAX2]], %[[MAX_ELSE]] ]
- // CHECK: store i16 [[MAXV]], i16* [[VAR2_LHS]],
- // CHECK: ret void
-
- //
- // Shuffle and reduce function
- // CHECK: define internal void [[SHUFFLE_AND_REDUCE]](i8* %0, i16 {{.*}}, i16 {{.*}}, i16 {{.*}})
- // CHECK: [[REMOTE_RED_LIST:%.+]] = alloca [2 x i8*], align
- // CHECK: [[REMOTE_ELT1:%.+]] = alloca i32
- // CHECK: [[REMOTE_ELT2:%.+]] = alloca i16
- //
- // CHECK: [[LANEID:%.+]] = load i16, i16* {{.+}}, align
- // CHECK: [[LANEOFFSET:%.+]] = load i16, i16* {{.+}}, align
- // CHECK: [[ALGVER:%.+]] = load i16, i16* {{.+}}, align
- //
- // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[RED_LIST:%.+]], i[[SZ]] 0, i[[SZ]] 0
- // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
- // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[REMOTE_RED_LIST:%.+]], i[[SZ]] 0, i[[SZ]] 0
- // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32*
- // CHECK: [[ELT_VAL:%.+]] = load i32, i32* [[ELT]], align
- //
- // CHECK: [[WS32:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
- // CHECK: [[WS:%.+]] = trunc i32 [[WS32]] to i16
- // CHECK: [[REMOTE_ELT1_VAL:%.+]] = call i32 @__kmpc_shuffle_int32(i32 [[ELT_VAL]], i16 [[LANEOFFSET]], i16 [[WS]])
- //
- // CHECK: store i32 [[REMOTE_ELT1_VAL]], i32* [[REMOTE_ELT1]], align
- // CHECK: [[REMOTE_ELT1C:%.+]] = bitcast i32* [[REMOTE_ELT1]] to i8*
- // CHECK: store i8* [[REMOTE_ELT1C]], i8** [[REMOTE_ELT_REF]], align
- //
- // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[RED_LIST]], i[[SZ]] 0, i[[SZ]] 1
- // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
- // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[REMOTE_RED_LIST]], i[[SZ]] 0, i[[SZ]] 1
- // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i16*
- // CHECK: [[ELT_VAL:%.+]] = load i16, i16* [[ELT]], align
- //
- // CHECK: [[ELT_CAST:%.+]] = sext i16 [[ELT_VAL]] to i32
- // CHECK: [[WS32:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
- // CHECK: [[WS:%.+]] = trunc i32 [[WS32]] to i16
- // CHECK: [[REMOTE_ELT2_VAL32:%.+]] = call i32 @__kmpc_shuffle_int32(i32 [[ELT_CAST]], i16 [[LANEOFFSET]], i16 [[WS]])
- // CHECK: [[REMOTE_ELT2_VAL:%.+]] = trunc i32 [[REMOTE_ELT2_VAL32]] to i16
- //
- // CHECK: store i16 [[REMOTE_ELT2_VAL]], i16* [[REMOTE_ELT2]], align
- // CHECK: [[REMOTE_ELT2C:%.+]] = bitcast i16* [[REMOTE_ELT2]] to i8*
- // CHECK: store i8* [[REMOTE_ELT2C]], i8** [[REMOTE_ELT_REF]], align
- //
- // Condition to reduce
- // CHECK: [[CONDALG0:%.+]] = icmp eq i16 [[ALGVER]], 0
- //
- // CHECK: [[COND1:%.+]] = icmp eq i16 [[ALGVER]], 1
- // CHECK: [[COND2:%.+]] = icmp ult i16 [[LANEID]], [[LANEOFFSET]]
- // CHECK: [[CONDALG1:%.+]] = and i1 [[COND1]], [[COND2]]
- //
- // CHECK: [[COND3:%.+]] = icmp eq i16 [[ALGVER]], 2
- // CHECK: [[COND4:%.+]] = and i16 [[LANEID]], 1
- // CHECK: [[COND5:%.+]] = icmp eq i16 [[COND4]], 0
- // CHECK: [[COND6:%.+]] = and i1 [[COND3]], [[COND5]]
- // CHECK: [[COND7:%.+]] = icmp sgt i16 [[LANEOFFSET]], 0
- // CHECK: [[CONDALG2:%.+]] = and i1 [[COND6]], [[COND7]]
- //
- // CHECK: [[COND8:%.+]] = or i1 [[CONDALG0]], [[CONDALG1]]
- // CHECK: [[SHOULD_REDUCE:%.+]] = or i1 [[COND8]], [[CONDALG2]]
- // CHECK: br i1 [[SHOULD_REDUCE]], label {{%?}}[[DO_REDUCE:.+]], label {{%?}}[[REDUCE_ELSE:.+]]
- //
- // CHECK: [[DO_REDUCE]]
- // CHECK: [[RED_LIST1_VOID:%.+]] = bitcast [2 x i8*]* [[RED_LIST]] to i8*
- // CHECK: [[RED_LIST2_VOID:%.+]] = bitcast [2 x i8*]* [[REMOTE_RED_LIST]] to i8*
- // CHECK: call void [[REDUCTION_FUNC]](i8* [[RED_LIST1_VOID]], i8* [[RED_LIST2_VOID]])
- // CHECK: br label {{%?}}[[REDUCE_CONT:.+]]
- //
- // CHECK: [[REDUCE_ELSE]]
- // CHECK: br label {{%?}}[[REDUCE_CONT]]
- //
- // CHECK: [[REDUCE_CONT]]
- // Now check if we should just copy over the remote reduction list
- // CHECK: [[COND1:%.+]] = icmp eq i16 [[ALGVER]], 1
- // CHECK: [[COND2:%.+]] = icmp uge i16 [[LANEID]], [[LANEOFFSET]]
- // CHECK: [[SHOULD_COPY:%.+]] = and i1 [[COND1]], [[COND2]]
- // CHECK: br i1 [[SHOULD_COPY]], label {{%?}}[[DO_COPY:.+]], label {{%?}}[[COPY_ELSE:.+]]
- //
- // CHECK: [[DO_COPY]]
- // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[REMOTE_RED_LIST]], i[[SZ]] 0, i[[SZ]] 0
- // CHECK: [[REMOTE_ELT_VOID:%.+]] = load i8*, i8** [[REMOTE_ELT_REF]],
- // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[RED_LIST]], i[[SZ]] 0, i[[SZ]] 0
- // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
- // CHECK: [[REMOTE_ELT:%.+]] = bitcast i8* [[REMOTE_ELT_VOID]] to i32*
- // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32*
- // CHECK: [[REMOTE_ELT_VAL:%.+]] = load i32, i32* [[REMOTE_ELT]], align
- // CHECK: store i32 [[REMOTE_ELT_VAL]], i32* [[ELT]], align
- //
- // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[REMOTE_RED_LIST]], i[[SZ]] 0, i[[SZ]] 1
- // CHECK: [[REMOTE_ELT_VOID:%.+]] = load i8*, i8** [[REMOTE_ELT_REF]],
- // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[RED_LIST]], i[[SZ]] 0, i[[SZ]] 1
- // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
- // CHECK: [[REMOTE_ELT:%.+]] = bitcast i8* [[REMOTE_ELT_VOID]] to i16*
- // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i16*
- // CHECK: [[REMOTE_ELT_VAL:%.+]] = load i16, i16* [[REMOTE_ELT]], align
- // CHECK: store i16 [[REMOTE_ELT_VAL]], i16* [[ELT]], align
- // CHECK: br label {{%?}}[[COPY_CONT:.+]]
- //
- // CHECK: [[COPY_ELSE]]
- // CHECK: br label {{%?}}[[COPY_CONT]]
- //
- // CHECK: [[COPY_CONT]]
- // CHECK: void
-
- //
- // Inter warp copy function
- // CHECK: define internal void [[INTER_WARP_COPY]](i8* %0, i32 %1)
- // CHECK-DAG: [[LANEID:%.+]] = and i32 {{.+}}, 31
- // CHECK-DAG: [[WARPID:%.+]] = ashr i32 {{.+}}, 5
- // CHECK-DAG: [[RED_LIST:%.+]] = bitcast i8* {{.+}} to [[RLT]]*
- // CHECK: call void @__kmpc_barrier(%struct.ident_t* @
- // CHECK: [[IS_WARP_MASTER:%.+]] = icmp eq i32 [[LANEID]], 0
- // CHECK: br i1 [[IS_WARP_MASTER]], label {{%?}}[[DO_COPY:.+]], label {{%?}}[[COPY_ELSE:.+]]
- //
- // [[DO_COPY]]
- // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 0
- // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
- // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32*
- //
- // CHECK: [[MEDIUM_ELT:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[WARPID]]
- // CHECK: [[ELT_VAL:%.+]] = load i32, i32* [[ELT]], align
- // CHECK: store volatile i32 [[ELT_VAL]], i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
- // CHECK: br label {{%?}}[[COPY_CONT:.+]]
- //
- // CHECK: [[COPY_ELSE]]
- // CHECK: br label {{%?}}[[COPY_CONT]]
- //
- // Barrier after copy to shared memory storage medium.
- // CHECK: [[COPY_CONT]]
- // CHECK: call void @__kmpc_barrier(%struct.ident_t* @
- // CHECK: [[ACTIVE_WARPS:%.+]] = load i32, i32*
- //
- // Read into warp 0.
- // CHECK: [[IS_W0_ACTIVE_THREAD:%.+]] = icmp ult i32 [[TID:%.+]], [[ACTIVE_WARPS]]
- // CHECK: br i1 [[IS_W0_ACTIVE_THREAD]], label {{%?}}[[DO_READ:.+]], label {{%?}}[[READ_ELSE:.+]]
- //
- // CHECK: [[DO_READ]]
- // CHECK: [[MEDIUM_ELT:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[TID]]
- // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 0
- // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
- // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32*
- // CHECK: [[MEDIUM_ELT_VAL:%.+]] = load volatile i32, i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
- // CHECK: store i32 [[MEDIUM_ELT_VAL]], i32* [[ELT]], align
- // CHECK: br label {{%?}}[[READ_CONT:.+]]
- //
- // CHECK: [[READ_ELSE]]
- // CHECK: br label {{%?}}[[READ_CONT]]
- //
- // CHECK: [[READ_CONT]]
- // CHECK: call void @__kmpc_barrier(%struct.ident_t* @
- // CHECK: [[IS_WARP_MASTER:%.+]] = icmp eq i32 [[LANEID]], 0
- // CHECK: br i1 [[IS_WARP_MASTER]], label {{%?}}[[DO_COPY:.+]], label {{%?}}[[COPY_ELSE:.+]]
- //
- // [[DO_COPY]]
- // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 1
- // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
- // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i16*
- //
- // CHECK: [[MEDIUM_ELT32:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[WARPID]]
- // CHECK: [[MEDIUM_ELT:%.+]] = bitcast i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT32]] to i16 addrspace([[SHARED_ADDRSPACE]])*
- // CHECK: [[ELT_VAL:%.+]] = load i16, i16* [[ELT]], align
- // CHECK: store volatile i16 [[ELT_VAL]], i16 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
- // CHECK: br label {{%?}}[[COPY_CONT:.+]]
- //
- // CHECK: [[COPY_ELSE]]
- // CHECK: br label {{%?}}[[COPY_CONT]]
- //
- // Barrier after copy to shared memory storage medium.
- // CHECK: [[COPY_CONT]]
- // CHECK: call void @__kmpc_barrier(%struct.ident_t* @
- // CHECK: [[ACTIVE_WARPS:%.+]] = load i32, i32*
- //
- // Read into warp 0.
- // CHECK: [[IS_W0_ACTIVE_THREAD:%.+]] = icmp ult i32 [[TID:%.+]], [[ACTIVE_WARPS]]
- // CHECK: br i1 [[IS_W0_ACTIVE_THREAD]], label {{%?}}[[DO_READ:.+]], label {{%?}}[[READ_ELSE:.+]]
- //
- // CHECK: [[DO_READ]]
- // CHECK: [[MEDIUM_ELT32:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[TID]]
- // CHECK: [[MEDIUM_ELT:%.+]] = bitcast i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT32]] to i16 addrspace([[SHARED_ADDRSPACE]])*
- // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 1
- // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
- // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i16*
- // CHECK: [[MEDIUM_ELT_VAL:%.+]] = load volatile i16, i16 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
- // CHECK: store i16 [[MEDIUM_ELT_VAL]], i16* [[ELT]], align
- // CHECK: br label {{%?}}[[READ_CONT:.+]]
- //
- // CHECK: [[READ_ELSE]]
- // CHECK: br label {{%?}}[[READ_CONT]]
- //
- // CHECK: [[READ_CONT]]
- // CHECK: ret
-
- // CHECK: define internal void [[RED_LIST_TO_GLOBAL_COPY]](i8* %0, i32 %1, i8* %2)
- // CHECK: [[GLOBAL_PTR:%.+]] = alloca i8*,
- // CHECK: [[IDX_PTR:%.+]] = alloca i32,
- // CHECK: [[RL_PTR:%.+]] = alloca i8*,
- // CHECK: store i8* %{{.+}}, i8** [[GLOBAL_PTR]],
- // CHECK: store i32 %{{.+}}, i32* [[IDX_PTR]],
- // CHECK: store i8* %{{.+}}, i8** [[RL_PTR]],
- // CHECK: [[RL_BC:%.+]] = load i8*, i8** [[RL_PTR]],
- // CHECK: [[RL:%.+]] = bitcast i8* [[RL_BC]] to [2 x i8*]*
- // CHECK: [[GLOBAL_BC:%.+]] = load i8*, i8** [[GLOBAL_PTR]],
- // CHECK: [[GLOBAL:%.+]] = bitcast i8* [[GLOBAL_BC]] to [[TEAM3_REDUCE_TY]]*
- // CHECK: [[IDX:%.+]] = load i32, i32* [[IDX_PTR]],
- // CHECK: [[RL_RED1_PTR:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[RL]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
- // CHECK: [[RL_RED1_BC:%.+]] = load i8*, i8** [[RL_RED1_PTR]],
- // CHECK: [[RL_RED1:%.+]] = bitcast i8* [[RL_RED1_BC]] to i32*
- // CHECK: [[GLOBAL_RED1_PTR:%.+]] = getelementptr inbounds [[TEAM3_REDUCE_TY]], [[TEAM3_REDUCE_TY]]* [[GLOBAL]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
- // CHECK: [[GLOBAL_RED1_IDX_PTR:%.+]] = getelementptr inbounds [{{1024|2048}} x i32], [{{1024|2048}} x i32]* [[GLOBAL_RED1_PTR]], i{{[0-9]+}} 0, i32 [[IDX]]
- // CHECK: [[LOC_RED1:%.+]] = load i32, i32* [[RL_RED1]],
- // CHECK: store i32 [[LOC_RED1]], i32* [[GLOBAL_RED1_IDX_PTR]],
- // CHECK: [[RL_RED1_PTR:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[RL]], i{{[0-9]+}} 0, i{{[0-9]+}} 1
- // CHECK: [[RL_RED1_BC:%.+]] = load i8*, i8** [[RL_RED1_PTR]],
- // CHECK: [[RL_RED1:%.+]] = bitcast i8* [[RL_RED1_BC]] to i16*
- // CHECK: [[GLOBAL_RED1_PTR:%.+]] = getelementptr inbounds [[TEAM3_REDUCE_TY]], [[TEAM3_REDUCE_TY]]* [[GLOBAL]], i{{[0-9]+}} 0, i{{[0-9]+}} 1
- // CHECK: [[GLOBAL_RED1_IDX_PTR:%.+]] = getelementptr inbounds [{{1024|2048}} x i16], [{{1024|2048}} x i16]* [[GLOBAL_RED1_PTR]], i{{[0-9]+}} 0, i32 [[IDX]]
- // CHECK: [[LOC_RED1:%.+]] = load i16, i16* [[RL_RED1]],
- // CHECK: store i16 [[LOC_RED1]], i16* [[GLOBAL_RED1_IDX_PTR]],
- // CHECK: ret void
-
- // CHECK: define internal void [[RED_LIST_TO_GLOBAL_RED]](i8* %0, i32 %1, i8* %2)
- // CHECK: [[GLOBAL_PTR:%.+]] = alloca i8*,
- // CHECK: [[IDX_PTR:%.+]] = alloca i32,
- // CHECK: [[RL_PTR:%.+]] = alloca i8*,
- // CHECK: [[LOCAL_RL:%.+]] = alloca [2 x i8*],
- // CHECK: store i8* %{{.+}}, i8** [[GLOBAL_PTR]],
- // CHECK: store i32 %{{.+}}, i32* [[IDX_PTR]],
- // CHECK: store i8* %{{.+}}, i8** [[RL_PTR]],
- // CHECK: [[GLOBAL_BC:%.+]] = load i8*, i8** [[GLOBAL_PTR]],
- // CHECK: [[GLOBAL:%.+]] = bitcast i8* [[GLOBAL_BC]] to [[TEAM3_REDUCE_TY]]*
- // CHECK: [[IDX:%.+]] = load i32, i32* [[IDX_PTR]],
- // CHECK: [[LOCAL_RL_RED1_PTR:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[LOCAL_RL]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
- // CHECK: [[GLOBAL_RED1_PTR:%.+]] = getelementptr inbounds [[TEAM3_REDUCE_TY]], [[TEAM3_REDUCE_TY]]* [[GLOBAL]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
- // CHECK: [[GLOBAL_RED1_IDX_PTR:%.+]] = getelementptr inbounds [{{1024|2048}} x i32], [{{1024|2048}} x i32]* [[GLOBAL_RED1_PTR]], i{{[0-9]+}} 0, i32 [[IDX]]
- // CHECK: [[GLOBAL_RED1_IDX_PTR_BC:%.+]] = bitcast i32* [[GLOBAL_RED1_IDX_PTR]] to i8*
- // CHECK: store i8* [[GLOBAL_RED1_IDX_PTR_BC]], i8** [[LOCAL_RL_RED1_PTR]]
- // CHECK: [[LOCAL_RL_RED1_PTR:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[LOCAL_RL]], i{{[0-9]+}} 0, i{{[0-9]+}} 1
- // CHECK: [[GLOBAL_RED1_PTR:%.+]] = getelementptr inbounds [[TEAM3_REDUCE_TY]], [[TEAM3_REDUCE_TY]]* [[GLOBAL]], i{{[0-9]+}} 0, i{{[0-9]+}} 1
- // CHECK: [[GLOBAL_RED1_IDX_PTR:%.+]] = getelementptr inbounds [{{1024|2048}} x i16], [{{1024|2048}} x i16]* [[GLOBAL_RED1_PTR]], i{{[0-9]+}} 0, i32 [[IDX]]
- // CHECK: [[GLOBAL_RED1_IDX_PTR_BC:%.+]] = bitcast i16* [[GLOBAL_RED1_IDX_PTR]] to i8*
- // CHECK: store i8* [[GLOBAL_RED1_IDX_PTR_BC]], i8** [[LOCAL_RL_RED1_PTR]]
- // CHECK: [[LOCAL_RL_BC:%.+]] = bitcast [2 x i8*]* [[LOCAL_RL]] to i8*
- // CHECK: [[RL_BC:%.+]] = load i8*, i8** [[RL_PTR]],
- // CHECK: call void [[REDUCTION_FUNC]](i8* [[LOCAL_RL_BC]], i8* [[RL_BC]])
- // CHECK: ret void
-
- // CHECK: define internal void [[GLOBAL_TO_RED_LIST_COPY]](i8* %0, i32 %1, i8* %2)
- // CHECK: [[GLOBAL_PTR:%.+]] = alloca i8*,
- // CHECK: [[IDX_PTR:%.+]] = alloca i32,
- // CHECK: [[RL_PTR:%.+]] = alloca i8*,
- // CHECK: store i8* %{{.+}}, i8** [[GLOBAL_PTR]],
- // CHECK: store i32 %{{.+}}, i32* [[IDX_PTR]],
- // CHECK: store i8* %{{.+}}, i8** [[RL_PTR]],
- // CHECK: [[RL_BC:%.+]] = load i8*, i8** [[RL_PTR]],
- // CHECK: [[RL:%.+]] = bitcast i8* [[RL_BC]] to [2 x i8*]*
- // CHECK: [[GLOBAL_BC:%.+]] = load i8*, i8** [[GLOBAL_PTR]],
- // CHECK: [[GLOBAL:%.+]] = bitcast i8* [[GLOBAL_BC]] to [[TEAM3_REDUCE_TY]]*
- // CHECK: [[IDX:%.+]] = load i32, i32* [[IDX_PTR]],
- // CHECK: [[RL_RED1_PTR:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[RL]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
- // CHECK: [[RL_RED1_BC:%.+]] = load i8*, i8** [[RL_RED1_PTR]],
- // CHECK: [[RL_RED1:%.+]] = bitcast i8* [[RL_RED1_BC]] to i32*
- // CHECK: [[GLOBAL_RED1_PTR:%.+]] = getelementptr inbounds [[TEAM3_REDUCE_TY]], [[TEAM3_REDUCE_TY]]* [[GLOBAL]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
- // CHECK: [[GLOBAL_RED1_IDX_PTR:%.+]] = getelementptr inbounds [{{1024|2048}} x i32], [{{1024|2048}} x i32]* [[GLOBAL_RED1_PTR]], i{{[0-9]+}} 0, i32 [[IDX]]
- // CHECK: [[GLOBAL_RED1:%.+]] = load i32, i32* [[GLOBAL_RED1_IDX_PTR]],
- // CHECK: store i32 [[GLOBAL_RED1]], i32* [[RL_RED1]],
- // CHECK: [[RL_RED1_PTR:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[RL]], i{{[0-9]+}} 0, i{{[0-9]+}} 1
- // CHECK: [[RL_RED1_BC:%.+]] = load i8*, i8** [[RL_RED1_PTR]],
- // CHECK: [[RL_RED1:%.+]] = bitcast i8* [[RL_RED1_BC]] to i16*
- // CHECK: [[GLOBAL_RED1_PTR:%.+]] = getelementptr inbounds [[TEAM3_REDUCE_TY]], [[TEAM3_REDUCE_TY]]* [[GLOBAL]], i{{[0-9]+}} 0, i{{[0-9]+}} 1
- // CHECK: [[GLOBAL_RED1_IDX_PTR:%.+]] = getelementptr inbounds [{{1024|2048}} x i16], [{{1024|2048}} x i16]* [[GLOBAL_RED1_PTR]], i{{[0-9]+}} 0, i32 [[IDX]]
- // CHECK: [[GLOBAL_RED1:%.+]] = load i16, i16* [[GLOBAL_RED1_IDX_PTR]],
i16* [[GLOBAL_RED1_IDX_PTR]], - // CHECK: store i16 [[GLOBAL_RED1]], i16* [[RL_RED1]], - // CHECK: ret void - - // CHECK: define internal void [[GLOBAL_TO_RED_LIST_RED]](i8* %0, i32 %1, i8* %2) - // CHECK: [[GLOBAL_PTR:%.+]] = alloca i8*, - // CHECK: [[IDX_PTR:%.+]] = alloca i32, - // CHECK: [[RL_PTR:%.+]] = alloca i8*, - // CHECK: [[LOCAL_RL:%.+]] = alloca [2 x i8*], - // CHECK: store i8* %{{.+}}, i8** [[GLOBAL_PTR]], - // CHECK: store i32 %{{.+}}, i32* [[IDX_PTR]], - // CHECK: store i8* %{{.+}}, i8** [[RL_PTR]], - // CHECK: [[GLOBAL_BC:%.+]] = load i8*, i8** [[GLOBAL_PTR]], - // CHECK: [[GLOBAL:%.+]] = bitcast i8* [[GLOBAL_BC]] to [[TEAM3_REDUCE_TY]]* - // CHECK: [[IDX:%.+]] = load i32, i32* [[IDX_PTR]], - // CHECK: [[LOCAL_RL_RED1_PTR:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[LOCAL_RL]], i{{[0-9]+}} 0, i{{[0-9]+}} 0 - // CHECK: [[GLOBAL_RED1_PTR:%.+]] = getelementptr inbounds [[TEAM3_REDUCE_TY]], [[TEAM3_REDUCE_TY]]* [[GLOBAL]], i{{[0-9]+}} 0, i{{[0-9]+}} 0 - // CHECK: [[GLOBAL_RED1_IDX_PTR:%.+]] = getelementptr inbounds [{{1024|2048}} x i32], [{{1024|2048}} x i32]* [[GLOBAL_RED1_PTR]], i{{[0-9]+}} 0, i32 [[IDX]] - // CHECK: [[GLOBAL_RED1_IDX_PTR_BC:%.+]] = bitcast i32* [[GLOBAL_RED1_IDX_PTR]] to i8* - // CHECK: store i8* [[GLOBAL_RED1_IDX_PTR_BC]], i8** [[LOCAL_RL_RED1_PTR]] - // CHECK: [[LOCAL_RL_RED1_PTR:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[LOCAL_RL]], i{{[0-9]+}} 0, i{{[0-9]+}} 1 - // CHECK: [[GLOBAL_RED1_PTR:%.+]] = getelementptr inbounds [[TEAM3_REDUCE_TY]], [[TEAM3_REDUCE_TY]]* [[GLOBAL]], i{{[0-9]+}} 0, i{{[0-9]+}} 1 - // CHECK: [[GLOBAL_RED1_IDX_PTR:%.+]] = getelementptr inbounds [{{1024|2048}} x i16], [{{1024|2048}} x i16]* [[GLOBAL_RED1_PTR]], i{{[0-9]+}} 0, i32 [[IDX]] - // CHECK: [[GLOBAL_RED1_IDX_PTR_BC:%.+]] = bitcast i16* [[GLOBAL_RED1_IDX_PTR]] to i8* - // CHECK: store i8* [[GLOBAL_RED1_IDX_PTR_BC]], i8** [[LOCAL_RL_RED1_PTR]] - // CHECK: [[LOCAL_RL_BC:%.+]] = bitcast [2 x i8*]* [[LOCAL_RL]] to i8* - // CHECK: [[RL_BC:%.+]] = load i8*, i8** [[RL_PTR]], - // CHECK: call void [[REDUCTION_FUNC]](i8* [[RL_BC]], i8* [[LOCAL_RL_BC]]) - // CHECK: ret void - #endif +// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l23_worker +// CHECK1-SAME: () #[[ATTR0:[0-9]+]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 8 +// CHECK1-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1 +// CHECK1-NEXT: store i8* null, i8** [[WORK_FN]], align 8 +// CHECK1-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1 +// CHECK1-NEXT: br label [[DOTAWAIT_WORK:%.*]] +// CHECK1: .await.work: +// CHECK1-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK1-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]]) +// CHECK1-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8 +// CHECK1-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1 +// CHECK1-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 8 +// CHECK1-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null +// CHECK1-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]] +// CHECK1: .select.workers: +// CHECK1-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1 +// CHECK1-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0 +// CHECK1-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]] +// CHECK1: .execute.parallel: +// CHECK1-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]]) +// 
CHECK1-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)* +// CHECK1-NEXT: call void [[TMP5]](i16 0, i32 [[TMP4]]) +// CHECK1-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]] +// CHECK1: .terminate.parallel: +// CHECK1-NEXT: call void @__kmpc_kernel_end_parallel() +// CHECK1-NEXT: br label [[DOTBARRIER_PARALLEL]] +// CHECK1: .barrier.parallel: +// CHECK1-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK1-NEXT: br label [[DOTAWAIT_WORK]] +// CHECK1: .exit: +// CHECK1-NEXT: ret void +// +// +// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l23 +// CHECK1-SAME: (i64 [[E:%.*]]) #[[ATTR1:[0-9]+]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[E_ADDR:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK1-NEXT: store i64 [[E]], i64* [[E_ADDR]], align 8 +// CHECK1-NEXT: [[CONV:%.*]] = bitcast i64* [[E_ADDR]] to double* +// CHECK1-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK1-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK1-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK1-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]] +// CHECK1-NEXT: [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]] +// CHECK1-NEXT: br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]] +// CHECK1: .worker: +// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l23_worker() #[[ATTR3:[0-9]+]] +// CHECK1-NEXT: br label [[DOTEXIT:%.*]] +// CHECK1: .mastercheck: +// CHECK1-NEXT: [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK1-NEXT: [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK1-NEXT: [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK1-NEXT: [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1 +// CHECK1-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1 +// CHECK1-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], -1 +// CHECK1-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP2]], [[TMP3]] +// CHECK1-NEXT: [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]] +// CHECK1-NEXT: br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]] +// CHECK1: .master: +// CHECK1-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK1-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK1-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]] +// CHECK1-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1) +// CHECK1-NEXT: call void @__kmpc_data_sharing_init_stack() +// CHECK1-NEXT: [[TMP5:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared", align 2 +// CHECK1-NEXT: [[TMP6:%.*]] = load i64, i64* @"_openmp_static_kernel$size", align 8 +// CHECK1-NEXT: call void @__kmpc_get_team_static_memory(i16 0, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds (%"union._shared_openmp_static_memory_type_$_", %"union._shared_openmp_static_memory_type_$_" addrspace(3)* @"_openmp_shared_static_glob_rd_$_", i32 0, i32 0, i32 0) to i8*), i64 [[TMP6]], i16 [[TMP5]], i8** addrspacecast (i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr" to i8**)) +// CHECK1-NEXT: [[TMP7:%.*]] = load i8*, i8* addrspace(3)* 
@"_openmp_kernel_static_glob_rd$ptr", align 8 +// CHECK1-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, i8* [[TMP7]], i64 0 +// CHECK1-NEXT: [[TMP9:%.*]] = bitcast i8* [[TMP8]] to %struct._globalized_locals_ty* +// CHECK1-NEXT: [[TMP10:%.*]] = load double, double* [[CONV]], align 8 +// CHECK1-NEXT: [[E7:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP9]], i32 0, i32 0 +// CHECK1-NEXT: store double [[TMP10]], double* [[E7]], align 8 +// CHECK1-NEXT: [[TMP11:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) +// CHECK1-NEXT: store i32 [[TMP11]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK1-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], double* [[E7]]) #[[ATTR3]] +// CHECK1-NEXT: [[TMP12:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared", align 2 +// CHECK1-NEXT: call void @__kmpc_restore_team_static_memory(i16 0, i16 [[TMP12]]) +// CHECK1-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]] +// CHECK1: .termination.notifier: +// CHECK1-NEXT: call void @__kmpc_kernel_deinit(i16 1) +// CHECK1-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK1-NEXT: br label [[DOTEXIT]] +// CHECK1: .exit: +// CHECK1-NEXT: ret void +// +// +// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__ +// CHECK1-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], double* nonnull align 8 dereferenceable(8) [[E:%.*]]) #[[ATTR1]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK1-NEXT: [[E_ADDR:%.*]] = alloca double*, align 8 +// CHECK1-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x i8*], align 8 +// CHECK1-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK1-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +// CHECK1-NEXT: store double* [[E]], double** [[E_ADDR]], align 8 +// CHECK1-NEXT: [[TMP0:%.*]] = load double*, double** [[E_ADDR]], align 8 +// CHECK1-NEXT: [[TMP1:%.*]] = load i8*, i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr", align 8 +// CHECK1-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, i8* [[TMP1]], i64 8 +// CHECK1-NEXT: [[TMP3:%.*]] = bitcast i8* [[TMP2]] to %struct._globalized_locals_ty.0* +// CHECK1-NEXT: [[E1:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0:%.*]], %struct._globalized_locals_ty.0* [[TMP3]], i32 0, i32 0 +// CHECK1-NEXT: store double 0.000000e+00, double* [[E1]], align 8 +// CHECK1-NEXT: [[TMP4:%.*]] = load double, double* [[E1]], align 8 +// CHECK1-NEXT: [[ADD:%.*]] = fadd double [[TMP4]], 5.000000e+00 +// CHECK1-NEXT: store double [[ADD]], double* [[E1]], align 8 +// CHECK1-NEXT: [[TMP5:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK1-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4 +// CHECK1-NEXT: [[TMP7:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 0 +// CHECK1-NEXT: [[TMP8:%.*]] = bitcast double* [[E1]] to i8* +// CHECK1-NEXT: store i8* [[TMP8]], i8** [[TMP7]], align 8 +// CHECK1-NEXT: [[TMP9:%.*]] = bitcast [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8* +// CHECK1-NEXT: [[TMP10:%.*]] = load i8*, i8** @"_openmp_teams_reductions_buffer_$_$ptr", align 8 +// CHECK1-NEXT: [[TMP11:%.*]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(%struct.ident_t* @[[GLOB1]], i32 [[TMP6]], i8* [[TMP10]], i32 1024, i8* [[TMP9]], void (i8*, i16, 
i16, i16)* @_omp_reduction_shuffle_and_reduce_func, void (i8*, i32)* @_omp_reduction_inter_warp_copy_func, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_copy_func, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_reduce_func, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_copy_func, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_reduce_func) +// CHECK1-NEXT: [[TMP12:%.*]] = icmp eq i32 [[TMP11]], 1 +// CHECK1-NEXT: br i1 [[TMP12]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]] +// CHECK1: .omp.reduction.then: +// CHECK1-NEXT: [[TMP13:%.*]] = load double, double* [[TMP0]], align 8 +// CHECK1-NEXT: [[TMP14:%.*]] = load double, double* [[E1]], align 8 +// CHECK1-NEXT: [[ADD2:%.*]] = fadd double [[TMP13]], [[TMP14]] +// CHECK1-NEXT: store double [[ADD2]], double* [[TMP0]], align 8 +// CHECK1-NEXT: call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP6]]) +// CHECK1-NEXT: br label [[DOTOMP_REDUCTION_DONE]] +// CHECK1: .omp.reduction.done: +// CHECK1-NEXT: ret void +// +// +// CHECK1-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func +// CHECK1-SAME: (i8* [[TMP0:%.*]], i16 signext [[TMP1:%.*]], i16 signext [[TMP2:%.*]], i16 signext [[TMP3:%.*]]) #[[ATTR0]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 8 +// CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca i16, align 2 +// CHECK1-NEXT: [[DOTADDR2:%.*]] = alloca i16, align 2 +// CHECK1-NEXT: [[DOTADDR3:%.*]] = alloca i16, align 2 +// CHECK1-NEXT: [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST:%.*]] = alloca [1 x i8*], align 8 +// CHECK1-NEXT: [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca double, align 8 +// CHECK1-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 8 +// CHECK1-NEXT: store i16 [[TMP1]], i16* [[DOTADDR1]], align 2 +// CHECK1-NEXT: store i16 [[TMP2]], i16* [[DOTADDR2]], align 2 +// CHECK1-NEXT: store i16 [[TMP3]], i16* [[DOTADDR3]], align 2 +// CHECK1-NEXT: [[TMP4:%.*]] = load i8*, i8** [[DOTADDR]], align 8 +// CHECK1-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to [1 x i8*]* +// CHECK1-NEXT: [[TMP6:%.*]] = load i16, i16* [[DOTADDR1]], align 2 +// CHECK1-NEXT: [[TMP7:%.*]] = load i16, i16* [[DOTADDR2]], align 2 +// CHECK1-NEXT: [[TMP8:%.*]] = load i16, i16* [[DOTADDR3]], align 2 +// CHECK1-NEXT: [[TMP9:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP5]], i64 0, i64 0 +// CHECK1-NEXT: [[TMP10:%.*]] = load i8*, i8** [[TMP9]], align 8 +// CHECK1-NEXT: [[TMP11:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i64 0, i64 0 +// CHECK1-NEXT: [[TMP12:%.*]] = bitcast i8* [[TMP10]] to double* +// CHECK1-NEXT: [[TMP13:%.*]] = getelementptr double, double* [[TMP12]], i64 1 +// CHECK1-NEXT: [[TMP14:%.*]] = bitcast double* [[TMP13]] to i8* +// CHECK1-NEXT: [[TMP15:%.*]] = bitcast double* [[TMP12]] to i64* +// CHECK1-NEXT: [[TMP16:%.*]] = bitcast double* [[DOTOMP_REDUCTION_ELEMENT]] to i64* +// CHECK1-NEXT: [[TMP17:%.*]] = load i64, i64* [[TMP15]], align 8 +// CHECK1-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK1-NEXT: [[TMP18:%.*]] = trunc i32 [[NVPTX_WARP_SIZE]] to i16 +// CHECK1-NEXT: [[TMP19:%.*]] = call i64 @__kmpc_shuffle_int64(i64 [[TMP17]], i16 [[TMP7]], i16 [[TMP18]]) +// CHECK1-NEXT: store i64 [[TMP19]], i64* [[TMP16]], align 8 +// CHECK1-NEXT: [[TMP20:%.*]] = getelementptr i64, i64* [[TMP15]], i64 1 +// CHECK1-NEXT: [[TMP21:%.*]] = getelementptr i64, i64* [[TMP16]], i64 1 +// CHECK1-NEXT: [[TMP22:%.*]] = bitcast double* [[DOTOMP_REDUCTION_ELEMENT]] to i8* +// CHECK1-NEXT: store 
i8* [[TMP22]], i8** [[TMP11]], align 8 +// CHECK1-NEXT: [[TMP23:%.*]] = icmp eq i16 [[TMP8]], 0 +// CHECK1-NEXT: [[TMP24:%.*]] = icmp eq i16 [[TMP8]], 1 +// CHECK1-NEXT: [[TMP25:%.*]] = icmp ult i16 [[TMP6]], [[TMP7]] +// CHECK1-NEXT: [[TMP26:%.*]] = and i1 [[TMP24]], [[TMP25]] +// CHECK1-NEXT: [[TMP27:%.*]] = icmp eq i16 [[TMP8]], 2 +// CHECK1-NEXT: [[TMP28:%.*]] = and i16 [[TMP6]], 1 +// CHECK1-NEXT: [[TMP29:%.*]] = icmp eq i16 [[TMP28]], 0 +// CHECK1-NEXT: [[TMP30:%.*]] = and i1 [[TMP27]], [[TMP29]] +// CHECK1-NEXT: [[TMP31:%.*]] = icmp sgt i16 [[TMP7]], 0 +// CHECK1-NEXT: [[TMP32:%.*]] = and i1 [[TMP30]], [[TMP31]] +// CHECK1-NEXT: [[TMP33:%.*]] = or i1 [[TMP23]], [[TMP26]] +// CHECK1-NEXT: [[TMP34:%.*]] = or i1 [[TMP33]], [[TMP32]] +// CHECK1-NEXT: br i1 [[TMP34]], label [[THEN:%.*]], label [[ELSE:%.*]] +// CHECK1: then: +// CHECK1-NEXT: [[TMP35:%.*]] = bitcast [1 x i8*]* [[TMP5]] to i8* +// CHECK1-NEXT: [[TMP36:%.*]] = bitcast [1 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to i8* +// CHECK1-NEXT: call void @"_omp$reduction$reduction_func"(i8* [[TMP35]], i8* [[TMP36]]) #[[ATTR3]] +// CHECK1-NEXT: br label [[IFCONT:%.*]] +// CHECK1: else: +// CHECK1-NEXT: br label [[IFCONT]] +// CHECK1: ifcont: +// CHECK1-NEXT: [[TMP37:%.*]] = icmp eq i16 [[TMP8]], 1 +// CHECK1-NEXT: [[TMP38:%.*]] = icmp uge i16 [[TMP6]], [[TMP7]] +// CHECK1-NEXT: [[TMP39:%.*]] = and i1 [[TMP37]], [[TMP38]] +// CHECK1-NEXT: br i1 [[TMP39]], label [[THEN4:%.*]], label [[ELSE5:%.*]] +// CHECK1: then4: +// CHECK1-NEXT: [[TMP40:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i64 0, i64 0 +// CHECK1-NEXT: [[TMP41:%.*]] = load i8*, i8** [[TMP40]], align 8 +// CHECK1-NEXT: [[TMP42:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP5]], i64 0, i64 0 +// CHECK1-NEXT: [[TMP43:%.*]] = load i8*, i8** [[TMP42]], align 8 +// CHECK1-NEXT: [[TMP44:%.*]] = bitcast i8* [[TMP41]] to double* +// CHECK1-NEXT: [[TMP45:%.*]] = bitcast i8* [[TMP43]] to double* +// CHECK1-NEXT: [[TMP46:%.*]] = load double, double* [[TMP44]], align 8 +// CHECK1-NEXT: store double [[TMP46]], double* [[TMP45]], align 8 +// CHECK1-NEXT: br label [[IFCONT6:%.*]] +// CHECK1: else5: +// CHECK1-NEXT: br label [[IFCONT6]] +// CHECK1: ifcont6: +// CHECK1-NEXT: ret void +// +// +// CHECK1-LABEL: define {{[^@]+}}@_omp_reduction_inter_warp_copy_func +// CHECK1-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 8 +// CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTCNT_ADDR:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) +// CHECK1-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 8 +// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK1-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK1-NEXT: [[NVPTX_TID2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK1-NEXT: [[NVPTX_LANE_ID:%.*]] = and i32 [[NVPTX_TID2]], 31 +// CHECK1-NEXT: [[NVPTX_TID3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK1-NEXT: [[NVPTX_WARP_ID:%.*]] = ashr i32 [[NVPTX_TID3]], 5 +// CHECK1-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 8 +// CHECK1-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [1 x i8*]* +// CHECK1-NEXT: store i32 0, i32* [[DOTCNT_ADDR]], align 4 +// CHECK1-NEXT: br label [[PRECOND:%.*]] +// CHECK1: precond: +// CHECK1-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTCNT_ADDR]], align 4 
+// CHECK1-NEXT: [[TMP6:%.*]] = icmp ult i32 [[TMP5]], 2 +// CHECK1-NEXT: br i1 [[TMP6]], label [[BODY:%.*]], label [[EXIT:%.*]] +// CHECK1: body: +// CHECK1-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP2]]) +// CHECK1-NEXT: [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0 +// CHECK1-NEXT: br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]] +// CHECK1: then: +// CHECK1-NEXT: [[TMP7:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP4]], i64 0, i64 0 +// CHECK1-NEXT: [[TMP8:%.*]] = load i8*, i8** [[TMP7]], align 8 +// CHECK1-NEXT: [[TMP9:%.*]] = bitcast i8* [[TMP8]] to i32* +// CHECK1-NEXT: [[TMP10:%.*]] = getelementptr i32, i32* [[TMP9]], i32 [[TMP5]] +// CHECK1-NEXT: [[TMP11:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]] +// CHECK1-NEXT: [[TMP12:%.*]] = load i32, i32* [[TMP10]], align 4 +// CHECK1-NEXT: store volatile i32 [[TMP12]], i32 addrspace(3)* [[TMP11]], align 4 +// CHECK1-NEXT: br label [[IFCONT:%.*]] +// CHECK1: else: +// CHECK1-NEXT: br label [[IFCONT]] +// CHECK1: ifcont: +// CHECK1-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]]) +// CHECK1-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTADDR1]], align 4 +// CHECK1-NEXT: [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP13]] +// CHECK1-NEXT: br i1 [[IS_ACTIVE_THREAD]], label [[THEN4:%.*]], label [[ELSE5:%.*]] +// CHECK1: then4: +// CHECK1-NEXT: [[TMP14:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]] +// CHECK1-NEXT: [[TMP15:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP4]], i64 0, i64 0 +// CHECK1-NEXT: [[TMP16:%.*]] = load i8*, i8** [[TMP15]], align 8 +// CHECK1-NEXT: [[TMP17:%.*]] = bitcast i8* [[TMP16]] to i32* +// CHECK1-NEXT: [[TMP18:%.*]] = getelementptr i32, i32* [[TMP17]], i32 [[TMP5]] +// CHECK1-NEXT: [[TMP19:%.*]] = load volatile i32, i32 addrspace(3)* [[TMP14]], align 4 +// CHECK1-NEXT: store i32 [[TMP19]], i32* [[TMP18]], align 4 +// CHECK1-NEXT: br label [[IFCONT6:%.*]] +// CHECK1: else5: +// CHECK1-NEXT: br label [[IFCONT6]] +// CHECK1: ifcont6: +// CHECK1-NEXT: [[TMP20:%.*]] = add nsw i32 [[TMP5]], 1 +// CHECK1-NEXT: store i32 [[TMP20]], i32* [[DOTCNT_ADDR]], align 4 +// CHECK1-NEXT: br label [[PRECOND]] +// CHECK1: exit: +// CHECK1-NEXT: ret void +// +// +// CHECK1-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_copy_func +// CHECK1-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 8 +// CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 8 +// CHECK1-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 8 +// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK1-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 8 +// CHECK1-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 8 +// CHECK1-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [1 x i8*]* +// CHECK1-NEXT: [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 8 +// CHECK1-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.1* +// CHECK1-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4 +// CHECK1-NEXT: [[TMP8:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP4]], i64 0, i64 0 +// CHECK1-NEXT: [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 8 +// CHECK1-NEXT: 
[[TMP10:%.*]] = bitcast i8* [[TMP9]] to double* +// CHECK1-NEXT: [[E:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1:%.*]], %struct._globalized_locals_ty.1* [[TMP6]], i32 0, i32 0 +// CHECK1-NEXT: [[TMP11:%.*]] = getelementptr inbounds [1024 x double], [1024 x double]* [[E]], i32 0, i32 [[TMP7]] +// CHECK1-NEXT: [[TMP12:%.*]] = load double, double* [[TMP10]], align 8 +// CHECK1-NEXT: store double [[TMP12]], double* [[TMP11]], align 128 +// CHECK1-NEXT: ret void +// +// +// CHECK1-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_reduce_func +// CHECK1-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 8 +// CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 8 +// CHECK1-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x i8*], align 8 +// CHECK1-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 8 +// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK1-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 8 +// CHECK1-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 8 +// CHECK1-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.1* +// CHECK1-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4 +// CHECK1-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 0 +// CHECK1-NEXT: [[E:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1:%.*]], %struct._globalized_locals_ty.1* [[TMP4]], i32 0, i32 0 +// CHECK1-NEXT: [[TMP7:%.*]] = getelementptr inbounds [1024 x double], [1024 x double]* [[E]], i32 0, i32 [[TMP5]] +// CHECK1-NEXT: [[TMP8:%.*]] = bitcast double* [[TMP7]] to i8* +// CHECK1-NEXT: store i8* [[TMP8]], i8** [[TMP6]], align 8 +// CHECK1-NEXT: [[TMP9:%.*]] = bitcast [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8* +// CHECK1-NEXT: [[TMP10:%.*]] = load i8*, i8** [[DOTADDR2]], align 8 +// CHECK1-NEXT: call void @"_omp$reduction$reduction_func"(i8* [[TMP9]], i8* [[TMP10]]) #[[ATTR3]] +// CHECK1-NEXT: ret void +// +// +// CHECK1-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_copy_func +// CHECK1-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 8 +// CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 8 +// CHECK1-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 8 +// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK1-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 8 +// CHECK1-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 8 +// CHECK1-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [1 x i8*]* +// CHECK1-NEXT: [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 8 +// CHECK1-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.1* +// CHECK1-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4 +// CHECK1-NEXT: [[TMP8:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP4]], i64 0, i64 0 +// CHECK1-NEXT: [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 8 +// CHECK1-NEXT: [[TMP10:%.*]] = bitcast i8* [[TMP9]] to double* +// CHECK1-NEXT: [[E:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1:%.*]], %struct._globalized_locals_ty.1* [[TMP6]], i32 0, i32 0 +// CHECK1-NEXT: [[TMP11:%.*]] = getelementptr inbounds [1024 x double], [1024 x double]* [[E]], i32 0, i32 [[TMP7]] +// 
CHECK1-NEXT: [[TMP12:%.*]] = load double, double* [[TMP11]], align 128 +// CHECK1-NEXT: store double [[TMP12]], double* [[TMP10]], align 8 +// CHECK1-NEXT: ret void +// +// +// CHECK1-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_reduce_func +// CHECK1-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 8 +// CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 8 +// CHECK1-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x i8*], align 8 +// CHECK1-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 8 +// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK1-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 8 +// CHECK1-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 8 +// CHECK1-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.1* +// CHECK1-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4 +// CHECK1-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 0 +// CHECK1-NEXT: [[E:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1:%.*]], %struct._globalized_locals_ty.1* [[TMP4]], i32 0, i32 0 +// CHECK1-NEXT: [[TMP7:%.*]] = getelementptr inbounds [1024 x double], [1024 x double]* [[E]], i32 0, i32 [[TMP5]] +// CHECK1-NEXT: [[TMP8:%.*]] = bitcast double* [[TMP7]] to i8* +// CHECK1-NEXT: store i8* [[TMP8]], i8** [[TMP6]], align 8 +// CHECK1-NEXT: [[TMP9:%.*]] = bitcast [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8* +// CHECK1-NEXT: [[TMP10:%.*]] = load i8*, i8** [[DOTADDR2]], align 8 +// CHECK1-NEXT: call void @"_omp$reduction$reduction_func"(i8* [[TMP10]], i8* [[TMP9]]) #[[ATTR3]] +// CHECK1-NEXT: ret void +// +// +// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l29_worker +// CHECK1-SAME: () #[[ATTR0]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 8 +// CHECK1-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1 +// CHECK1-NEXT: store i8* null, i8** [[WORK_FN]], align 8 +// CHECK1-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1 +// CHECK1-NEXT: br label [[DOTAWAIT_WORK:%.*]] +// CHECK1: .await.work: +// CHECK1-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK1-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]]) +// CHECK1-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8 +// CHECK1-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1 +// CHECK1-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 8 +// CHECK1-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null +// CHECK1-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]] +// CHECK1: .select.workers: +// CHECK1-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1 +// CHECK1-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0 +// CHECK1-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]] +// CHECK1: .execute.parallel: +// CHECK1-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) +// CHECK1-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)* +// CHECK1-NEXT: call void [[TMP5]](i16 0, i32 [[TMP4]]) +// CHECK1-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]] +// CHECK1: .terminate.parallel: +// CHECK1-NEXT: call void @__kmpc_kernel_end_parallel() +// CHECK1-NEXT: br label [[DOTBARRIER_PARALLEL]] +// CHECK1: 
.barrier.parallel: +// CHECK1-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK1-NEXT: br label [[DOTAWAIT_WORK]] +// CHECK1: .exit: +// CHECK1-NEXT: ret void +// +// +// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l29 +// CHECK1-SAME: (i64 [[C:%.*]], i64 [[D:%.*]]) #[[ATTR1]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[C_ADDR:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[D_ADDR:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK1-NEXT: store i64 [[C]], i64* [[C_ADDR]], align 8 +// CHECK1-NEXT: store i64 [[D]], i64* [[D_ADDR]], align 8 +// CHECK1-NEXT: [[CONV:%.*]] = bitcast i64* [[C_ADDR]] to i8* +// CHECK1-NEXT: [[CONV1:%.*]] = bitcast i64* [[D_ADDR]] to float* +// CHECK1-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK1-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK1-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK1-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]] +// CHECK1-NEXT: [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]] +// CHECK1-NEXT: br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]] +// CHECK1: .worker: +// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l29_worker() #[[ATTR3]] +// CHECK1-NEXT: br label [[DOTEXIT:%.*]] +// CHECK1: .mastercheck: +// CHECK1-NEXT: [[NVPTX_TID2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK1-NEXT: [[NVPTX_NUM_THREADS3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK1-NEXT: [[NVPTX_WARP_SIZE4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK1-NEXT: [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE4]], 1 +// CHECK1-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS3]], 1 +// CHECK1-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], -1 +// CHECK1-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP2]], [[TMP3]] +// CHECK1-NEXT: [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID2]], [[MASTER_TID]] +// CHECK1-NEXT: br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]] +// CHECK1: .master: +// CHECK1-NEXT: [[NVPTX_NUM_THREADS5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK1-NEXT: [[NVPTX_WARP_SIZE6:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK1-NEXT: [[THREAD_LIMIT7:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS5]], [[NVPTX_WARP_SIZE6]] +// CHECK1-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT7]], i16 1) +// CHECK1-NEXT: call void @__kmpc_data_sharing_init_stack() +// CHECK1-NEXT: [[TMP5:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared1", align 2 +// CHECK1-NEXT: [[TMP6:%.*]] = load i64, i64* @"_openmp_static_kernel$size2", align 8 +// CHECK1-NEXT: call void @__kmpc_get_team_static_memory(i16 0, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds (%"union._shared_openmp_static_memory_type_$_", %"union._shared_openmp_static_memory_type_$_" addrspace(3)* @"_openmp_shared_static_glob_rd_$_", i32 0, i32 0, i32 0) to i8*), i64 [[TMP6]], i16 [[TMP5]], i8** addrspacecast (i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr" to i8**)) +// CHECK1-NEXT: [[TMP7:%.*]] = load i8*, i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr", align 8 +// CHECK1-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, i8* [[TMP7]], i64 0 +// CHECK1-NEXT: [[TMP9:%.*]] = 
bitcast i8* [[TMP8]] to %struct._globalized_locals_ty.2* +// CHECK1-NEXT: [[TMP10:%.*]] = load i8, i8* [[CONV]], align 8 +// CHECK1-NEXT: [[C8:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_2:%.*]], %struct._globalized_locals_ty.2* [[TMP9]], i32 0, i32 1 +// CHECK1-NEXT: store i8 [[TMP10]], i8* [[C8]], align 4 +// CHECK1-NEXT: [[TMP11:%.*]] = load float, float* [[CONV1]], align 8 +// CHECK1-NEXT: [[D9:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_2]], %struct._globalized_locals_ty.2* [[TMP9]], i32 0, i32 0 +// CHECK1-NEXT: store float [[TMP11]], float* [[D9]], align 4 +// CHECK1-NEXT: [[TMP12:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) +// CHECK1-NEXT: store i32 [[TMP12]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK1-NEXT: call void @__omp_outlined__3(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i8* [[C8]], float* [[D9]]) #[[ATTR3]] +// CHECK1-NEXT: [[TMP13:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared1", align 2 +// CHECK1-NEXT: call void @__kmpc_restore_team_static_memory(i16 0, i16 [[TMP13]]) +// CHECK1-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]] +// CHECK1: .termination.notifier: +// CHECK1-NEXT: call void @__kmpc_kernel_deinit(i16 1) +// CHECK1-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK1-NEXT: br label [[DOTEXIT]] +// CHECK1: .exit: +// CHECK1-NEXT: ret void +// +// +// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__3 +// CHECK1-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i8* nonnull align 1 dereferenceable(1) [[C:%.*]], float* nonnull align 4 dereferenceable(4) [[D:%.*]]) #[[ATTR1]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK1-NEXT: [[C_ADDR:%.*]] = alloca i8*, align 8 +// CHECK1-NEXT: [[D_ADDR:%.*]] = alloca float*, align 8 +// CHECK1-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 8 +// CHECK1-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK1-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +// CHECK1-NEXT: store i8* [[C]], i8** [[C_ADDR]], align 8 +// CHECK1-NEXT: store float* [[D]], float** [[D_ADDR]], align 8 +// CHECK1-NEXT: [[TMP0:%.*]] = load i8*, i8** [[C_ADDR]], align 8 +// CHECK1-NEXT: [[TMP1:%.*]] = load float*, float** [[D_ADDR]], align 8 +// CHECK1-NEXT: [[TMP2:%.*]] = load i8*, i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr", align 8 +// CHECK1-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, i8* [[TMP2]], i64 8 +// CHECK1-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.3* +// CHECK1-NEXT: [[C1:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_3:%.*]], %struct._globalized_locals_ty.3* [[TMP4]], i32 0, i32 1 +// CHECK1-NEXT: [[D2:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_3]], %struct._globalized_locals_ty.3* [[TMP4]], i32 0, i32 0 +// CHECK1-NEXT: store i8 0, i8* [[C1]], align 4 +// CHECK1-NEXT: store float 1.000000e+00, float* [[D2]], align 4 +// CHECK1-NEXT: [[TMP5:%.*]] = load i8, i8* [[C1]], align 4 +// CHECK1-NEXT: [[CONV:%.*]] = sext i8 [[TMP5]] to i32 +// CHECK1-NEXT: [[XOR:%.*]] = xor i32 [[CONV]], 2 +// CHECK1-NEXT: [[CONV3:%.*]] = trunc i32 [[XOR]] to i8 +// CHECK1-NEXT: store i8 [[CONV3]], i8* [[C1]], align 4 +// CHECK1-NEXT: [[TMP6:%.*]] = load float, float* [[D2]], align 4 +// CHECK1-NEXT: [[MUL:%.*]] = fmul float [[TMP6]], 
3.300000e+01 +// CHECK1-NEXT: store float [[MUL]], float* [[D2]], align 4 +// CHECK1-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK1-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4 +// CHECK1-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 0 +// CHECK1-NEXT: store i8* [[C1]], i8** [[TMP9]], align 8 +// CHECK1-NEXT: [[TMP10:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 1 +// CHECK1-NEXT: [[TMP11:%.*]] = bitcast float* [[D2]] to i8* +// CHECK1-NEXT: store i8* [[TMP11]], i8** [[TMP10]], align 8 +// CHECK1-NEXT: [[TMP12:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8* +// CHECK1-NEXT: [[TMP13:%.*]] = load i8*, i8** @"_openmp_teams_reductions_buffer_$_$ptr", align 8 +// CHECK1-NEXT: [[TMP14:%.*]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(%struct.ident_t* @[[GLOB1]], i32 [[TMP8]], i8* [[TMP13]], i32 1024, i8* [[TMP12]], void (i8*, i16, i16, i16)* @_omp_reduction_shuffle_and_reduce_func5, void (i8*, i32)* @_omp_reduction_inter_warp_copy_func6, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_copy_func7, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_reduce_func8, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_copy_func9, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_reduce_func10) +// CHECK1-NEXT: [[TMP15:%.*]] = icmp eq i32 [[TMP14]], 1 +// CHECK1-NEXT: br i1 [[TMP15]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]] +// CHECK1: .omp.reduction.then: +// CHECK1-NEXT: [[TMP16:%.*]] = load i8, i8* [[TMP0]], align 1 +// CHECK1-NEXT: [[CONV4:%.*]] = sext i8 [[TMP16]] to i32 +// CHECK1-NEXT: [[TMP17:%.*]] = load i8, i8* [[C1]], align 4 +// CHECK1-NEXT: [[CONV5:%.*]] = sext i8 [[TMP17]] to i32 +// CHECK1-NEXT: [[XOR6:%.*]] = xor i32 [[CONV4]], [[CONV5]] +// CHECK1-NEXT: [[CONV7:%.*]] = trunc i32 [[XOR6]] to i8 +// CHECK1-NEXT: store i8 [[CONV7]], i8* [[TMP0]], align 1 +// CHECK1-NEXT: [[TMP18:%.*]] = load float, float* [[TMP1]], align 4 +// CHECK1-NEXT: [[TMP19:%.*]] = load float, float* [[D2]], align 4 +// CHECK1-NEXT: [[MUL8:%.*]] = fmul float [[TMP18]], [[TMP19]] +// CHECK1-NEXT: store float [[MUL8]], float* [[TMP1]], align 4 +// CHECK1-NEXT: call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP8]]) +// CHECK1-NEXT: br label [[DOTOMP_REDUCTION_DONE]] +// CHECK1: .omp.reduction.done: +// CHECK1-NEXT: ret void +// +// +// CHECK1-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func5 +// CHECK1-SAME: (i8* [[TMP0:%.*]], i16 signext [[TMP1:%.*]], i16 signext [[TMP2:%.*]], i16 signext [[TMP3:%.*]]) #[[ATTR0]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 8 +// CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca i16, align 2 +// CHECK1-NEXT: [[DOTADDR2:%.*]] = alloca i16, align 2 +// CHECK1-NEXT: [[DOTADDR3:%.*]] = alloca i16, align 2 +// CHECK1-NEXT: [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST:%.*]] = alloca [2 x i8*], align 8 +// CHECK1-NEXT: [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca i8, align 1 +// CHECK1-NEXT: [[DOTOMP_REDUCTION_ELEMENT4:%.*]] = alloca float, align 4 +// CHECK1-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 8 +// CHECK1-NEXT: store i16 [[TMP1]], i16* [[DOTADDR1]], align 2 +// CHECK1-NEXT: store i16 [[TMP2]], i16* [[DOTADDR2]], align 2 +// CHECK1-NEXT: store i16 [[TMP3]], i16* [[DOTADDR3]], align 2 +// CHECK1-NEXT: [[TMP4:%.*]] = load i8*, i8** [[DOTADDR]], align 8 +// CHECK1-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to [2 x i8*]* +// CHECK1-NEXT: 
[[TMP6:%.*]] = load i16, i16* [[DOTADDR1]], align 2 +// CHECK1-NEXT: [[TMP7:%.*]] = load i16, i16* [[DOTADDR2]], align 2 +// CHECK1-NEXT: [[TMP8:%.*]] = load i16, i16* [[DOTADDR3]], align 2 +// CHECK1-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i64 0, i64 0 +// CHECK1-NEXT: [[TMP10:%.*]] = load i8*, i8** [[TMP9]], align 8 +// CHECK1-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i64 0, i64 0 +// CHECK1-NEXT: [[TMP12:%.*]] = getelementptr i8, i8* [[TMP10]], i64 1 +// CHECK1-NEXT: [[TMP13:%.*]] = load i8, i8* [[TMP10]], align 1 +// CHECK1-NEXT: [[TMP14:%.*]] = sext i8 [[TMP13]] to i32 +// CHECK1-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK1-NEXT: [[TMP15:%.*]] = trunc i32 [[NVPTX_WARP_SIZE]] to i16 +// CHECK1-NEXT: [[TMP16:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP14]], i16 [[TMP7]], i16 [[TMP15]]) +// CHECK1-NEXT: [[TMP17:%.*]] = trunc i32 [[TMP16]] to i8 +// CHECK1-NEXT: store i8 [[TMP17]], i8* [[DOTOMP_REDUCTION_ELEMENT]], align 1 +// CHECK1-NEXT: [[TMP18:%.*]] = getelementptr i8, i8* [[TMP10]], i64 1 +// CHECK1-NEXT: [[TMP19:%.*]] = getelementptr i8, i8* [[DOTOMP_REDUCTION_ELEMENT]], i64 1 +// CHECK1-NEXT: store i8* [[DOTOMP_REDUCTION_ELEMENT]], i8** [[TMP11]], align 8 +// CHECK1-NEXT: [[TMP20:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i64 0, i64 1 +// CHECK1-NEXT: [[TMP21:%.*]] = load i8*, i8** [[TMP20]], align 8 +// CHECK1-NEXT: [[TMP22:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i64 0, i64 1 +// CHECK1-NEXT: [[TMP23:%.*]] = bitcast i8* [[TMP21]] to float* +// CHECK1-NEXT: [[TMP24:%.*]] = getelementptr float, float* [[TMP23]], i64 1 +// CHECK1-NEXT: [[TMP25:%.*]] = bitcast float* [[TMP24]] to i8* +// CHECK1-NEXT: [[TMP26:%.*]] = bitcast float* [[TMP23]] to i32* +// CHECK1-NEXT: [[TMP27:%.*]] = bitcast float* [[DOTOMP_REDUCTION_ELEMENT4]] to i32* +// CHECK1-NEXT: [[TMP28:%.*]] = load i32, i32* [[TMP26]], align 4 +// CHECK1-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK1-NEXT: [[TMP29:%.*]] = trunc i32 [[NVPTX_WARP_SIZE5]] to i16 +// CHECK1-NEXT: [[TMP30:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP28]], i16 [[TMP7]], i16 [[TMP29]]) +// CHECK1-NEXT: store i32 [[TMP30]], i32* [[TMP27]], align 4 +// CHECK1-NEXT: [[TMP31:%.*]] = getelementptr i32, i32* [[TMP26]], i64 1 +// CHECK1-NEXT: [[TMP32:%.*]] = getelementptr i32, i32* [[TMP27]], i64 1 +// CHECK1-NEXT: [[TMP33:%.*]] = bitcast float* [[DOTOMP_REDUCTION_ELEMENT4]] to i8* +// CHECK1-NEXT: store i8* [[TMP33]], i8** [[TMP22]], align 8 +// CHECK1-NEXT: [[TMP34:%.*]] = icmp eq i16 [[TMP8]], 0 +// CHECK1-NEXT: [[TMP35:%.*]] = icmp eq i16 [[TMP8]], 1 +// CHECK1-NEXT: [[TMP36:%.*]] = icmp ult i16 [[TMP6]], [[TMP7]] +// CHECK1-NEXT: [[TMP37:%.*]] = and i1 [[TMP35]], [[TMP36]] +// CHECK1-NEXT: [[TMP38:%.*]] = icmp eq i16 [[TMP8]], 2 +// CHECK1-NEXT: [[TMP39:%.*]] = and i16 [[TMP6]], 1 +// CHECK1-NEXT: [[TMP40:%.*]] = icmp eq i16 [[TMP39]], 0 +// CHECK1-NEXT: [[TMP41:%.*]] = and i1 [[TMP38]], [[TMP40]] +// CHECK1-NEXT: [[TMP42:%.*]] = icmp sgt i16 [[TMP7]], 0 +// CHECK1-NEXT: [[TMP43:%.*]] = and i1 [[TMP41]], [[TMP42]] +// CHECK1-NEXT: [[TMP44:%.*]] = or i1 [[TMP34]], [[TMP37]] +// CHECK1-NEXT: [[TMP45:%.*]] = or i1 [[TMP44]], [[TMP43]] +// CHECK1-NEXT: br i1 [[TMP45]], label [[THEN:%.*]], label [[ELSE:%.*]] +// CHECK1: then: +// CHECK1-NEXT: [[TMP46:%.*]] = bitcast [2 x i8*]* [[TMP5]] to i8* +// 
CHECK1-NEXT: [[TMP47:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to i8* +// CHECK1-NEXT: call void @"_omp$reduction$reduction_func4"(i8* [[TMP46]], i8* [[TMP47]]) #[[ATTR3]] +// CHECK1-NEXT: br label [[IFCONT:%.*]] +// CHECK1: else: +// CHECK1-NEXT: br label [[IFCONT]] +// CHECK1: ifcont: +// CHECK1-NEXT: [[TMP48:%.*]] = icmp eq i16 [[TMP8]], 1 +// CHECK1-NEXT: [[TMP49:%.*]] = icmp uge i16 [[TMP6]], [[TMP7]] +// CHECK1-NEXT: [[TMP50:%.*]] = and i1 [[TMP48]], [[TMP49]] +// CHECK1-NEXT: br i1 [[TMP50]], label [[THEN6:%.*]], label [[ELSE7:%.*]] +// CHECK1: then6: +// CHECK1-NEXT: [[TMP51:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i64 0, i64 0 +// CHECK1-NEXT: [[TMP52:%.*]] = load i8*, i8** [[TMP51]], align 8 +// CHECK1-NEXT: [[TMP53:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i64 0, i64 0 +// CHECK1-NEXT: [[TMP54:%.*]] = load i8*, i8** [[TMP53]], align 8 +// CHECK1-NEXT: [[TMP55:%.*]] = load i8, i8* [[TMP52]], align 1 +// CHECK1-NEXT: store i8 [[TMP55]], i8* [[TMP54]], align 1 +// CHECK1-NEXT: [[TMP56:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i64 0, i64 1 +// CHECK1-NEXT: [[TMP57:%.*]] = load i8*, i8** [[TMP56]], align 8 +// CHECK1-NEXT: [[TMP58:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i64 0, i64 1 +// CHECK1-NEXT: [[TMP59:%.*]] = load i8*, i8** [[TMP58]], align 8 +// CHECK1-NEXT: [[TMP60:%.*]] = bitcast i8* [[TMP57]] to float* +// CHECK1-NEXT: [[TMP61:%.*]] = bitcast i8* [[TMP59]] to float* +// CHECK1-NEXT: [[TMP62:%.*]] = load float, float* [[TMP60]], align 4 +// CHECK1-NEXT: store float [[TMP62]], float* [[TMP61]], align 4 +// CHECK1-NEXT: br label [[IFCONT8:%.*]] +// CHECK1: else7: +// CHECK1-NEXT: br label [[IFCONT8]] +// CHECK1: ifcont8: +// CHECK1-NEXT: ret void +// +// +// CHECK1-LABEL: define {{[^@]+}}@_omp_reduction_inter_warp_copy_func6 +// CHECK1-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 8 +// CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) +// CHECK1-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 8 +// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK1-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK1-NEXT: [[NVPTX_TID2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK1-NEXT: [[NVPTX_LANE_ID:%.*]] = and i32 [[NVPTX_TID2]], 31 +// CHECK1-NEXT: [[NVPTX_TID3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK1-NEXT: [[NVPTX_WARP_ID:%.*]] = ashr i32 [[NVPTX_TID3]], 5 +// CHECK1-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 8 +// CHECK1-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]* +// CHECK1-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]]) +// CHECK1-NEXT: [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0 +// CHECK1-NEXT: br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]] +// CHECK1: then: +// CHECK1-NEXT: [[TMP5:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i64 0, i64 0 +// CHECK1-NEXT: [[TMP6:%.*]] = load i8*, i8** [[TMP5]], align 8 +// CHECK1-NEXT: [[TMP7:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]] +// CHECK1-NEXT: [[TMP8:%.*]] = bitcast i32 addrspace(3)* [[TMP7]] to i8 addrspace(3)* 
+// CHECK1-NEXT: [[TMP9:%.*]] = load i8, i8* [[TMP6]], align 1 +// CHECK1-NEXT: store volatile i8 [[TMP9]], i8 addrspace(3)* [[TMP8]], align 1 +// CHECK1-NEXT: br label [[IFCONT:%.*]] +// CHECK1: else: +// CHECK1-NEXT: br label [[IFCONT]] +// CHECK1: ifcont: +// CHECK1-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]]) +// CHECK1-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTADDR1]], align 4 +// CHECK1-NEXT: [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP10]] +// CHECK1-NEXT: br i1 [[IS_ACTIVE_THREAD]], label [[THEN4:%.*]], label [[ELSE5:%.*]] +// CHECK1: then4: +// CHECK1-NEXT: [[TMP11:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]] +// CHECK1-NEXT: [[TMP12:%.*]] = bitcast i32 addrspace(3)* [[TMP11]] to i8 addrspace(3)* +// CHECK1-NEXT: [[TMP13:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i64 0, i64 0 +// CHECK1-NEXT: [[TMP14:%.*]] = load i8*, i8** [[TMP13]], align 8 +// CHECK1-NEXT: [[TMP15:%.*]] = load volatile i8, i8 addrspace(3)* [[TMP12]], align 1 +// CHECK1-NEXT: store i8 [[TMP15]], i8* [[TMP14]], align 1 +// CHECK1-NEXT: br label [[IFCONT6:%.*]] +// CHECK1: else5: +// CHECK1-NEXT: br label [[IFCONT6]] +// CHECK1: ifcont6: +// CHECK1-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]]) +// CHECK1-NEXT: [[WARP_MASTER7:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0 +// CHECK1-NEXT: br i1 [[WARP_MASTER7]], label [[THEN8:%.*]], label [[ELSE9:%.*]] +// CHECK1: then8: +// CHECK1-NEXT: [[TMP16:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i64 0, i64 1 +// CHECK1-NEXT: [[TMP17:%.*]] = load i8*, i8** [[TMP16]], align 8 +// CHECK1-NEXT: [[TMP18:%.*]] = bitcast i8* [[TMP17]] to i32* +// CHECK1-NEXT: [[TMP19:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]] +// CHECK1-NEXT: [[TMP20:%.*]] = load i32, i32* [[TMP18]], align 4 +// CHECK1-NEXT: store volatile i32 [[TMP20]], i32 addrspace(3)* [[TMP19]], align 4 +// CHECK1-NEXT: br label [[IFCONT10:%.*]] +// CHECK1: else9: +// CHECK1-NEXT: br label [[IFCONT10]] +// CHECK1: ifcont10: +// CHECK1-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]]) +// CHECK1-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTADDR1]], align 4 +// CHECK1-NEXT: [[IS_ACTIVE_THREAD11:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP21]] +// CHECK1-NEXT: br i1 [[IS_ACTIVE_THREAD11]], label [[THEN12:%.*]], label [[ELSE13:%.*]] +// CHECK1: then12: +// CHECK1-NEXT: [[TMP22:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]] +// CHECK1-NEXT: [[TMP23:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i64 0, i64 1 +// CHECK1-NEXT: [[TMP24:%.*]] = load i8*, i8** [[TMP23]], align 8 +// CHECK1-NEXT: [[TMP25:%.*]] = bitcast i8* [[TMP24]] to i32* +// CHECK1-NEXT: [[TMP26:%.*]] = load volatile i32, i32 addrspace(3)* [[TMP22]], align 4 +// CHECK1-NEXT: store i32 [[TMP26]], i32* [[TMP25]], align 4 +// CHECK1-NEXT: br label [[IFCONT14:%.*]] +// CHECK1: else13: +// CHECK1-NEXT: br label [[IFCONT14]] +// CHECK1: ifcont14: +// CHECK1-NEXT: ret void +// +// +// CHECK1-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_copy_func7 +// CHECK1-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 8 +// CHECK1-NEXT: 
[[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 8 +// CHECK1-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 8 +// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK1-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 8 +// CHECK1-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 8 +// CHECK1-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]* +// CHECK1-NEXT: [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 8 +// CHECK1-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.4* +// CHECK1-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4 +// CHECK1-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i64 0, i64 0 +// CHECK1-NEXT: [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 8 +// CHECK1-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4:%.*]], %struct._globalized_locals_ty.4* [[TMP6]], i32 0, i32 0 +// CHECK1-NEXT: [[TMP10:%.*]] = getelementptr inbounds [1024 x i8], [1024 x i8]* [[C]], i32 0, i32 [[TMP7]] +// CHECK1-NEXT: [[TMP11:%.*]] = load i8, i8* [[TMP9]], align 1 +// CHECK1-NEXT: store i8 [[TMP11]], i8* [[TMP10]], align 128 +// CHECK1-NEXT: [[TMP12:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i64 0, i64 1 +// CHECK1-NEXT: [[TMP13:%.*]] = load i8*, i8** [[TMP12]], align 8 +// CHECK1-NEXT: [[TMP14:%.*]] = bitcast i8* [[TMP13]] to float* +// CHECK1-NEXT: [[D:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4]], %struct._globalized_locals_ty.4* [[TMP6]], i32 0, i32 1 +// CHECK1-NEXT: [[TMP15:%.*]] = getelementptr inbounds [1024 x float], [1024 x float]* [[D]], i32 0, i32 [[TMP7]] +// CHECK1-NEXT: [[TMP16:%.*]] = load float, float* [[TMP14]], align 4 +// CHECK1-NEXT: store float [[TMP16]], float* [[TMP15]], align 128 +// CHECK1-NEXT: ret void +// +// +// CHECK1-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_reduce_func8 +// CHECK1-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 8 +// CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 8 +// CHECK1-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 8 +// CHECK1-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 8 +// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK1-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 8 +// CHECK1-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 8 +// CHECK1-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.4* +// CHECK1-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4 +// CHECK1-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 0 +// CHECK1-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4:%.*]], %struct._globalized_locals_ty.4* [[TMP4]], i32 0, i32 0 +// CHECK1-NEXT: [[TMP7:%.*]] = getelementptr inbounds [1024 x i8], [1024 x i8]* [[C]], i32 0, i32 [[TMP5]] +// CHECK1-NEXT: store i8* [[TMP7]], i8** [[TMP6]], align 8 +// CHECK1-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 1 +// CHECK1-NEXT: [[D:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4]], %struct._globalized_locals_ty.4* [[TMP4]], i32 0, i32 1 +// CHECK1-NEXT: [[TMP9:%.*]] = getelementptr inbounds [1024 x float], [1024 x float]* [[D]], i32 0, i32 [[TMP5]] +// CHECK1-NEXT: 
[[TMP10:%.*]] = bitcast float* [[TMP9]] to i8*
+// CHECK1-NEXT: store i8* [[TMP10]], i8** [[TMP8]], align 8
+// CHECK1-NEXT: [[TMP11:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
+// CHECK1-NEXT: [[TMP12:%.*]] = load i8*, i8** [[DOTADDR2]], align 8
+// CHECK1-NEXT: call void @"_omp$reduction$reduction_func4"(i8* [[TMP11]], i8* [[TMP12]]) #[[ATTR3]]
+// CHECK1-NEXT: ret void
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_copy_func9
+// CHECK1-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 8
+// CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 8
+// CHECK1-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 8
+// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
+// CHECK1-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 8
+// CHECK1-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 8
+// CHECK1-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]*
+// CHECK1-NEXT: [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 8
+// CHECK1-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.4*
+// CHECK1-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4
+// CHECK1-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i64 0, i64 0
+// CHECK1-NEXT: [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 8
+// CHECK1-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4:%.*]], %struct._globalized_locals_ty.4* [[TMP6]], i32 0, i32 0
+// CHECK1-NEXT: [[TMP10:%.*]] = getelementptr inbounds [1024 x i8], [1024 x i8]* [[C]], i32 0, i32 [[TMP7]]
+// CHECK1-NEXT: [[TMP11:%.*]] = load i8, i8* [[TMP10]], align 128
+// CHECK1-NEXT: store i8 [[TMP11]], i8* [[TMP9]], align 1
+// CHECK1-NEXT: [[TMP12:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i64 0, i64 1
+// CHECK1-NEXT: [[TMP13:%.*]] = load i8*, i8** [[TMP12]], align 8
+// CHECK1-NEXT: [[TMP14:%.*]] = bitcast i8* [[TMP13]] to float*
+// CHECK1-NEXT: [[D:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4]], %struct._globalized_locals_ty.4* [[TMP6]], i32 0, i32 1
+// CHECK1-NEXT: [[TMP15:%.*]] = getelementptr inbounds [1024 x float], [1024 x float]* [[D]], i32 0, i32 [[TMP7]]
+// CHECK1-NEXT: [[TMP16:%.*]] = load float, float* [[TMP15]], align 128
+// CHECK1-NEXT: store float [[TMP16]], float* [[TMP14]], align 4
+// CHECK1-NEXT: ret void
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_reduce_func10
+// CHECK1-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 8
+// CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 8
+// CHECK1-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 8
+// CHECK1-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 8
+// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
+// CHECK1-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 8
+// CHECK1-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 8
+// CHECK1-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.4*
+// CHECK1-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4
+// CHECK1-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 0
+// CHECK1-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4:%.*]], %struct._globalized_locals_ty.4* [[TMP4]], i32 0, i32 0
+// CHECK1-NEXT: [[TMP7:%.*]] = getelementptr inbounds [1024 x i8], [1024 x i8]* [[C]], i32 0, i32 [[TMP5]]
+// CHECK1-NEXT: store i8* [[TMP7]], i8** [[TMP6]], align 8
+// CHECK1-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 1
+// CHECK1-NEXT: [[D:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4]], %struct._globalized_locals_ty.4* [[TMP4]], i32 0, i32 1
+// CHECK1-NEXT: [[TMP9:%.*]] = getelementptr inbounds [1024 x float], [1024 x float]* [[D]], i32 0, i32 [[TMP5]]
+// CHECK1-NEXT: [[TMP10:%.*]] = bitcast float* [[TMP9]] to i8*
+// CHECK1-NEXT: store i8* [[TMP10]], i8** [[TMP8]], align 8
+// CHECK1-NEXT: [[TMP11:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
+// CHECK1-NEXT: [[TMP12:%.*]] = load i8*, i8** [[DOTADDR2]], align 8
+// CHECK1-NEXT: call void @"_omp$reduction$reduction_func4"(i8* [[TMP12]], i8* [[TMP11]]) #[[ATTR3]]
+// CHECK1-NEXT: ret void
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l36
+// CHECK1-SAME: (i64 [[A:%.*]], i64 [[B:%.*]]) #[[ATTR1]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[A_ADDR:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[B_ADDR:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK1-NEXT: store i64 [[A]], i64* [[A_ADDR]], align 8
+// CHECK1-NEXT: store i64 [[B]], i64* [[B_ADDR]], align 8
+// CHECK1-NEXT: [[CONV:%.*]] = bitcast i64* [[A_ADDR]] to i32*
+// CHECK1-NEXT: [[CONV1:%.*]] = bitcast i64* [[B_ADDR]] to i16*
+// CHECK1-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK1-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 1)
+// CHECK1-NEXT: call void @__kmpc_data_sharing_init_stack_spmd()
+// CHECK1-NEXT: br label [[DOTEXECUTE:%.*]]
+// CHECK1: .execute:
+// CHECK1-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3:[0-9]+]])
+// CHECK1-NEXT: store i32 [[TMP0]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK1-NEXT: call void @__omp_outlined__11(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32* [[CONV]], i16* [[CONV1]]) #[[ATTR3]]
+// CHECK1-NEXT: br label [[DOTOMP_DEINIT:%.*]]
+// CHECK1: .omp.deinit:
+// CHECK1-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 1)
+// CHECK1-NEXT: br label [[DOTEXIT:%.*]]
+// CHECK1: .exit:
+// CHECK1-NEXT: ret void
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__11
+// CHECK1-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i16* nonnull align 2 dereferenceable(2) [[B:%.*]]) #[[ATTR1]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK1-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 8
+// CHECK1-NEXT: [[B_ADDR:%.*]] = alloca i16*, align 8
+// CHECK1-NEXT: [[A1:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[B2:%.*]] = alloca i16, align 2
+// CHECK1-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [2 x i8*], align 8
+// CHECK1-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 8
+// CHECK1-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
+// CHECK1-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 8
+// CHECK1-NEXT: store i16* [[B]], i16** [[B_ADDR]], align 8
+// CHECK1-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 8
+// CHECK1-NEXT: [[TMP1:%.*]] = load i16*, i16** [[B_ADDR]], align 8
+// CHECK1-NEXT: store i32 0, i32* [[A1]], align 4
+// CHECK1-NEXT: store i16 -32768, i16* [[B2]], align 2
+// CHECK1-NEXT: [[TMP2:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0
+// CHECK1-NEXT: [[TMP3:%.*]] = bitcast i32* [[A1]] to i8*
+// CHECK1-NEXT: store i8* [[TMP3]], i8** [[TMP2]], align 8
+// CHECK1-NEXT: [[TMP4:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1
+// CHECK1-NEXT: [[TMP5:%.*]] = bitcast i16* [[B2]] to i8*
+// CHECK1-NEXT: store i8* [[TMP5]], i8** [[TMP4]], align 8
+// CHECK1-NEXT: [[TMP6:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT: [[TMP7:%.*]] = load i32, i32* [[TMP6]], align 4
+// CHECK1-NEXT: [[TMP8:%.*]] = bitcast [2 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK1-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP7]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32*, i16*)* @__omp_outlined__12 to i8*), i8* null, i8** [[TMP8]], i64 2)
+// CHECK1-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 0
+// CHECK1-NEXT: [[TMP10:%.*]] = bitcast i32* [[A1]] to i8*
+// CHECK1-NEXT: store i8* [[TMP10]], i8** [[TMP9]], align 8
+// CHECK1-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 1
+// CHECK1-NEXT: [[TMP12:%.*]] = bitcast i16* [[B2]] to i8*
+// CHECK1-NEXT: store i8* [[TMP12]], i8** [[TMP11]], align 8
+// CHECK1-NEXT: [[TMP13:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
+// CHECK1-NEXT: [[TMP14:%.*]] = load i8*, i8** @"_openmp_teams_reductions_buffer_$_$ptr", align 8
+// CHECK1-NEXT: [[TMP15:%.*]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(%struct.ident_t* @[[GLOB3]], i32 [[TMP7]], i8* [[TMP14]], i32 1024, i8* [[TMP13]], void (i8*, i16, i16, i16)* @_omp_reduction_shuffle_and_reduce_func17, void (i8*, i32)* @_omp_reduction_inter_warp_copy_func18, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_copy_func19, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_reduce_func20, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_copy_func21, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_reduce_func22)
+// CHECK1-NEXT: [[TMP16:%.*]] = icmp eq i32 [[TMP15]], 1
+// CHECK1-NEXT: br i1 [[TMP16]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]]
+// CHECK1: .omp.reduction.then:
+// CHECK1-NEXT: [[TMP17:%.*]] = load i32, i32* [[TMP0]], align 4
+// CHECK1-NEXT: [[TMP18:%.*]] = load i32, i32* [[A1]], align 4
+// CHECK1-NEXT: [[OR:%.*]] = or i32 [[TMP17]], [[TMP18]]
+// CHECK1-NEXT: store i32 [[OR]], i32* [[TMP0]], align 4
+// CHECK1-NEXT: [[TMP19:%.*]] = load i16, i16* [[TMP1]], align 2
+// CHECK1-NEXT: [[CONV:%.*]] = sext i16 [[TMP19]] to i32
+// CHECK1-NEXT: [[TMP20:%.*]] = load i16, i16* [[B2]], align 2
+// CHECK1-NEXT: [[CONV3:%.*]] = sext i16 [[TMP20]] to i32
+// CHECK1-NEXT: [[CMP:%.*]] = icmp sgt i32 [[CONV]], [[CONV3]]
+// CHECK1-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK1: cond.true:
+// CHECK1-NEXT: [[TMP21:%.*]] = load i16, i16* [[TMP1]], align 2
+// CHECK1-NEXT: br label [[COND_END:%.*]]
+// CHECK1: cond.false:
+// CHECK1-NEXT: [[TMP22:%.*]] = load i16, i16* [[B2]], align 2
+// CHECK1-NEXT: br label [[COND_END]]
+// CHECK1: cond.end:
+// CHECK1-NEXT: [[COND:%.*]] = phi i16 [ [[TMP21]], [[COND_TRUE]] ], [ [[TMP22]], [[COND_FALSE]] ]
+// CHECK1-NEXT: store i16 [[COND]], i16* [[TMP1]], align 2
+// CHECK1-NEXT: call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP7]])
+// CHECK1-NEXT: br label [[DOTOMP_REDUCTION_DONE]]
+// CHECK1: .omp.reduction.done:
+// CHECK1-NEXT: ret void
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__12
+// CHECK1-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i16* nonnull align 2 dereferenceable(2) [[B:%.*]]) #[[ATTR1]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK1-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 8
+// CHECK1-NEXT: [[B_ADDR:%.*]] = alloca i16*, align 8
+// CHECK1-NEXT: [[A1:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[B2:%.*]] = alloca i16, align 2
+// CHECK1-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 8
+// CHECK1-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
+// CHECK1-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 8
+// CHECK1-NEXT: store i16* [[B]], i16** [[B_ADDR]], align 8
+// CHECK1-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 8
+// CHECK1-NEXT: [[TMP1:%.*]] = load i16*, i16** [[B_ADDR]], align 8
+// CHECK1-NEXT: store i32 0, i32* [[A1]], align 4
+// CHECK1-NEXT: store i16 -32768, i16* [[B2]], align 2
+// CHECK1-NEXT: [[TMP2:%.*]] = load i32, i32* [[A1]], align 4
+// CHECK1-NEXT: [[OR:%.*]] = or i32 [[TMP2]], 1
+// CHECK1-NEXT: store i32 [[OR]], i32* [[A1]], align 4
+// CHECK1-NEXT: [[TMP3:%.*]] = load i16, i16* [[B2]], align 2
+// CHECK1-NEXT: [[CONV:%.*]] = sext i16 [[TMP3]] to i32
+// CHECK1-NEXT: [[CMP:%.*]] = icmp sgt i32 99, [[CONV]]
+// CHECK1-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK1: cond.true:
+// CHECK1-NEXT: br label [[COND_END:%.*]]
+// CHECK1: cond.false:
+// CHECK1-NEXT: [[TMP4:%.*]] = load i16, i16* [[B2]], align 2
+// CHECK1-NEXT: [[CONV3:%.*]] = sext i16 [[TMP4]] to i32
+// CHECK1-NEXT: br label [[COND_END]]
+// CHECK1: cond.end:
+// CHECK1-NEXT: [[COND:%.*]] = phi i32 [ 99, [[COND_TRUE]] ], [ [[CONV3]], [[COND_FALSE]] ]
+// CHECK1-NEXT: [[CONV4:%.*]] = trunc i32 [[COND]] to i16
+// CHECK1-NEXT: store i16 [[CONV4]], i16* [[B2]], align 2
+// CHECK1-NEXT: [[TMP5:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4
+// CHECK1-NEXT: [[TMP7:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 0
+// CHECK1-NEXT: [[TMP8:%.*]] = bitcast i32* [[A1]] to i8*
+// CHECK1-NEXT: store i8* [[TMP8]], i8** [[TMP7]], align 8
+// CHECK1-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 1
+// CHECK1-NEXT: [[TMP10:%.*]] = bitcast i16* [[B2]] to i8*
+// CHECK1-NEXT: store i8* [[TMP10]], i8** [[TMP9]], align 8
+// CHECK1-NEXT: [[TMP11:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
+// CHECK1-NEXT: [[TMP12:%.*]] = call i32 @__kmpc_nvptx_parallel_reduce_nowait_v2(%struct.ident_t* @[[GLOB3]], i32 [[TMP6]], i32 2, i64 16, i8* [[TMP11]], void (i8*, i16, i16, i16)* @_omp_reduction_shuffle_and_reduce_func14, void (i8*, i32)* @_omp_reduction_inter_warp_copy_func15)
+// CHECK1-NEXT: [[TMP13:%.*]] = icmp eq i32 [[TMP12]], 1
+// CHECK1-NEXT: br i1 [[TMP13]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]]
+// CHECK1: .omp.reduction.then:
+// CHECK1-NEXT: [[TMP14:%.*]] = load i32, i32* [[TMP0]], align 4
+// CHECK1-NEXT: [[TMP15:%.*]] = load i32, i32* [[A1]], align 4
+// CHECK1-NEXT: [[OR5:%.*]] = or i32 [[TMP14]], [[TMP15]]
+// CHECK1-NEXT: store i32 [[OR5]], i32* [[TMP0]], align 4
+// CHECK1-NEXT: [[TMP16:%.*]] = load i16, i16* [[TMP1]], align 2
+// CHECK1-NEXT: [[CONV6:%.*]] = sext i16 [[TMP16]] to i32
+// CHECK1-NEXT: [[TMP17:%.*]] = load i16, i16* [[B2]], align 2
+// CHECK1-NEXT: [[CONV7:%.*]] = sext i16 [[TMP17]] to i32
+// CHECK1-NEXT: [[CMP8:%.*]] = icmp sgt i32 [[CONV6]], [[CONV7]]
+// CHECK1-NEXT: br i1 [[CMP8]], label [[COND_TRUE9:%.*]], label [[COND_FALSE10:%.*]]
+// CHECK1: cond.true9:
+// CHECK1-NEXT: [[TMP18:%.*]] = load i16, i16* [[TMP1]], align 2
+// CHECK1-NEXT: br label [[COND_END11:%.*]]
+// CHECK1: cond.false10:
+// CHECK1-NEXT: [[TMP19:%.*]] = load i16, i16* [[B2]], align 2
+// CHECK1-NEXT: br label [[COND_END11]]
+// CHECK1: cond.end11:
+// CHECK1-NEXT: [[COND12:%.*]] = phi i16 [ [[TMP18]], [[COND_TRUE9]] ], [ [[TMP19]], [[COND_FALSE10]] ]
+// CHECK1-NEXT: store i16 [[COND12]], i16* [[TMP1]], align 2
+// CHECK1-NEXT: call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP6]])
+// CHECK1-NEXT: br label [[DOTOMP_REDUCTION_DONE]]
+// CHECK1: .omp.reduction.done:
+// CHECK1-NEXT: ret void
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func14
+// CHECK1-SAME: (i8* [[TMP0:%.*]], i16 signext [[TMP1:%.*]], i16 signext [[TMP2:%.*]], i16 signext [[TMP3:%.*]]) #[[ATTR0]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 8
+// CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca i16, align 2
+// CHECK1-NEXT: [[DOTADDR2:%.*]] = alloca i16, align 2
+// CHECK1-NEXT: [[DOTADDR3:%.*]] = alloca i16, align 2
+// CHECK1-NEXT: [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST:%.*]] = alloca [2 x i8*], align 8
+// CHECK1-NEXT: [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_REDUCTION_ELEMENT4:%.*]] = alloca i16, align 2
+// CHECK1-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 8
+// CHECK1-NEXT: store i16 [[TMP1]], i16* [[DOTADDR1]], align 2
+// CHECK1-NEXT: store i16 [[TMP2]], i16* [[DOTADDR2]], align 2
+// CHECK1-NEXT: store i16 [[TMP3]], i16* [[DOTADDR3]], align 2
+// CHECK1-NEXT: [[TMP4:%.*]] = load i8*, i8** [[DOTADDR]], align 8
+// CHECK1-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to [2 x i8*]*
+// CHECK1-NEXT: [[TMP6:%.*]] = load i16, i16* [[DOTADDR1]], align 2
+// CHECK1-NEXT: [[TMP7:%.*]] = load i16, i16* [[DOTADDR2]], align 2
+// CHECK1-NEXT: [[TMP8:%.*]] = load i16, i16* [[DOTADDR3]], align 2
+// CHECK1-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i64 0, i64 0
+// CHECK1-NEXT: [[TMP10:%.*]] = load i8*, i8** [[TMP9]], align 8
+// CHECK1-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i64 0, i64 0
+// CHECK1-NEXT: [[TMP12:%.*]] = bitcast i8* [[TMP10]] to i32*
+// CHECK1-NEXT: [[TMP13:%.*]] = getelementptr i32, i32* [[TMP12]], i64 1
+// CHECK1-NEXT: [[TMP14:%.*]] = bitcast i32* [[TMP13]] to i8*
+// CHECK1-NEXT: [[TMP15:%.*]] = load i32, i32* [[TMP12]], align 4
+// CHECK1-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+// CHECK1-NEXT: [[TMP16:%.*]] = trunc i32 [[NVPTX_WARP_SIZE]] to i16
+// CHECK1-NEXT: [[TMP17:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP15]], i16 [[TMP7]], i16 [[TMP16]])
+// CHECK1-NEXT: store i32 [[TMP17]], i32* [[DOTOMP_REDUCTION_ELEMENT]], align 4
+// CHECK1-NEXT: [[TMP18:%.*]] = getelementptr i32, i32* [[TMP12]], i64 1
+// CHECK1-NEXT: [[TMP19:%.*]] = getelementptr i32, i32* [[DOTOMP_REDUCTION_ELEMENT]], i64 1
+// CHECK1-NEXT: [[TMP20:%.*]] = bitcast i32* [[DOTOMP_REDUCTION_ELEMENT]] to i8*
+// CHECK1-NEXT: store i8* [[TMP20]], i8** [[TMP11]], align 8
+// CHECK1-NEXT: [[TMP21:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i64 0, i64 1
+// CHECK1-NEXT: [[TMP22:%.*]] = load i8*, i8** [[TMP21]], align 8
+// CHECK1-NEXT: [[TMP23:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i64 0, i64 1
+// CHECK1-NEXT: [[TMP24:%.*]] = bitcast i8* [[TMP22]] to i16*
+// CHECK1-NEXT: [[TMP25:%.*]] = getelementptr i16, i16* [[TMP24]], i64 1
+// CHECK1-NEXT: [[TMP26:%.*]] = bitcast i16* [[TMP25]] to i8*
+// CHECK1-NEXT: [[TMP27:%.*]] = load i16, i16* [[TMP24]], align 2
+// CHECK1-NEXT: [[TMP28:%.*]] = sext i16 [[TMP27]] to i32
+// CHECK1-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+// CHECK1-NEXT: [[TMP29:%.*]] = trunc i32 [[NVPTX_WARP_SIZE5]] to i16
+// CHECK1-NEXT: [[TMP30:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP28]], i16 [[TMP7]], i16 [[TMP29]])
+// CHECK1-NEXT: [[TMP31:%.*]] = trunc i32 [[TMP30]] to i16
+// CHECK1-NEXT: store i16 [[TMP31]], i16* [[DOTOMP_REDUCTION_ELEMENT4]], align 2
+// CHECK1-NEXT: [[TMP32:%.*]] = getelementptr i16, i16* [[TMP24]], i64 1
+// CHECK1-NEXT: [[TMP33:%.*]] = getelementptr i16, i16* [[DOTOMP_REDUCTION_ELEMENT4]], i64 1
+// CHECK1-NEXT: [[TMP34:%.*]] = bitcast i16* [[DOTOMP_REDUCTION_ELEMENT4]] to i8*
+// CHECK1-NEXT: store i8* [[TMP34]], i8** [[TMP23]], align 8
+// CHECK1-NEXT: [[TMP35:%.*]] = icmp eq i16 [[TMP8]], 0
+// CHECK1-NEXT: [[TMP36:%.*]] = icmp eq i16 [[TMP8]], 1
+// CHECK1-NEXT: [[TMP37:%.*]] = icmp ult i16 [[TMP6]], [[TMP7]]
+// CHECK1-NEXT: [[TMP38:%.*]] = and i1 [[TMP36]], [[TMP37]]
+// CHECK1-NEXT: [[TMP39:%.*]] = icmp eq i16 [[TMP8]], 2
+// CHECK1-NEXT: [[TMP40:%.*]] = and i16 [[TMP6]], 1
+// CHECK1-NEXT: [[TMP41:%.*]] = icmp eq i16 [[TMP40]], 0
+// CHECK1-NEXT: [[TMP42:%.*]] = and i1 [[TMP39]], [[TMP41]]
+// CHECK1-NEXT: [[TMP43:%.*]] = icmp sgt i16 [[TMP7]], 0
+// CHECK1-NEXT: [[TMP44:%.*]] = and i1 [[TMP42]], [[TMP43]]
+// CHECK1-NEXT: [[TMP45:%.*]] = or i1 [[TMP35]], [[TMP38]]
+// CHECK1-NEXT: [[TMP46:%.*]] = or i1 [[TMP45]], [[TMP44]]
+// CHECK1-NEXT: br i1 [[TMP46]], label [[THEN:%.*]], label [[ELSE:%.*]]
+// CHECK1: then:
+// CHECK1-NEXT: [[TMP47:%.*]] = bitcast [2 x i8*]* [[TMP5]] to i8*
+// CHECK1-NEXT: [[TMP48:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to i8*
+// CHECK1-NEXT: call void @"_omp$reduction$reduction_func13"(i8* [[TMP47]], i8* [[TMP48]]) #[[ATTR3]]
+// CHECK1-NEXT: br label [[IFCONT:%.*]]
+// CHECK1: else:
+// CHECK1-NEXT: br label [[IFCONT]]
+// CHECK1: ifcont:
+// CHECK1-NEXT: [[TMP49:%.*]] = icmp eq i16 [[TMP8]], 1
+// CHECK1-NEXT: [[TMP50:%.*]] = icmp uge i16 [[TMP6]], [[TMP7]]
+// CHECK1-NEXT: [[TMP51:%.*]] = and i1 [[TMP49]], [[TMP50]]
+// CHECK1-NEXT: br i1 [[TMP51]], label [[THEN6:%.*]], label [[ELSE7:%.*]]
+// CHECK1: then6:
+// CHECK1-NEXT: [[TMP52:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i64 0, i64 0
+// CHECK1-NEXT: [[TMP53:%.*]] = load i8*, i8** [[TMP52]], align 8
+// CHECK1-NEXT: [[TMP54:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i64 0, i64 0
+// CHECK1-NEXT: [[TMP55:%.*]] = load i8*, i8** [[TMP54]], align 8
+// CHECK1-NEXT: [[TMP56:%.*]] = bitcast i8* [[TMP53]] to i32*
+// CHECK1-NEXT: [[TMP57:%.*]] = bitcast i8* [[TMP55]] to i32*
+// CHECK1-NEXT: [[TMP58:%.*]] = load i32, i32* [[TMP56]], align 4
+// CHECK1-NEXT: store i32 [[TMP58]], i32* [[TMP57]], align 4
+// CHECK1-NEXT: [[TMP59:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i64 0, i64 1
+// CHECK1-NEXT: [[TMP60:%.*]] = load i8*, i8** [[TMP59]], align 8
+// CHECK1-NEXT: [[TMP61:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i64 0, i64 1
+// CHECK1-NEXT: [[TMP62:%.*]] = load i8*, i8** [[TMP61]], align 8
+// CHECK1-NEXT: [[TMP63:%.*]] = bitcast i8* [[TMP60]] to i16*
+// CHECK1-NEXT: [[TMP64:%.*]] = bitcast i8* [[TMP62]] to i16*
+// CHECK1-NEXT: [[TMP65:%.*]] = load i16, i16* [[TMP63]], align 2
+// CHECK1-NEXT: store i16 [[TMP65]], i16* [[TMP64]], align 2
+// CHECK1-NEXT: br label [[IFCONT8:%.*]]
+// CHECK1: else7:
+// CHECK1-NEXT: br label [[IFCONT8]]
+// CHECK1: ifcont8:
+// CHECK1-NEXT: ret void
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@_omp_reduction_inter_warp_copy_func15
+// CHECK1-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 8
+// CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]])
+// CHECK1-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 8
+// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
+// CHECK1-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+// CHECK1-NEXT: [[NVPTX_TID2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+// CHECK1-NEXT: [[NVPTX_LANE_ID:%.*]] = and i32 [[NVPTX_TID2]], 31
+// CHECK1-NEXT: [[NVPTX_TID3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+// CHECK1-NEXT: [[NVPTX_WARP_ID:%.*]] = ashr i32 [[NVPTX_TID3]], 5
+// CHECK1-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 8
+// CHECK1-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]*
+// CHECK1-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4:[0-9]+]], i32 [[TMP2]])
+// CHECK1-NEXT: [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0
+// CHECK1-NEXT: br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]]
+// CHECK1: then:
+// CHECK1-NEXT: [[TMP5:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i64 0, i64 0
+// CHECK1-NEXT: [[TMP6:%.*]] = load i8*, i8** [[TMP5]], align 8
+// CHECK1-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to i32*
+// CHECK1-NEXT: [[TMP8:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]]
+// CHECK1-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP7]], align 4
+// CHECK1-NEXT: store volatile i32 [[TMP9]], i32 addrspace(3)* [[TMP8]], align 4
+// CHECK1-NEXT: br label [[IFCONT:%.*]]
+// CHECK1: else:
+// CHECK1-NEXT: br label [[IFCONT]]
+// CHECK1: ifcont:
+// CHECK1-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]])
+// CHECK1-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTADDR1]], align 4
+// CHECK1-NEXT: [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP10]]
+// CHECK1-NEXT: br i1 [[IS_ACTIVE_THREAD]], label [[THEN4:%.*]], label [[ELSE5:%.*]]
+// CHECK1: then4:
+// CHECK1-NEXT: [[TMP11:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]]
+// CHECK1-NEXT: [[TMP12:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i64 0, i64 0
+// CHECK1-NEXT: [[TMP13:%.*]] = load i8*, i8** [[TMP12]], align 8
+// CHECK1-NEXT: [[TMP14:%.*]] = bitcast i8* [[TMP13]] to i32*
+// CHECK1-NEXT: [[TMP15:%.*]] = load volatile i32, i32 addrspace(3)* [[TMP11]], align 4
+// CHECK1-NEXT: store i32 [[TMP15]], i32* [[TMP14]], align 4
+// CHECK1-NEXT: br label [[IFCONT6:%.*]]
+// CHECK1: else5:
+// CHECK1-NEXT: br label [[IFCONT6]]
+// CHECK1: ifcont6:
+// CHECK1-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]])
+// CHECK1-NEXT: [[WARP_MASTER7:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0
+// CHECK1-NEXT: br i1 [[WARP_MASTER7]], label [[THEN8:%.*]], label [[ELSE9:%.*]]
+// CHECK1: then8:
+// CHECK1-NEXT: [[TMP16:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i64 0, i64 1
+// CHECK1-NEXT: [[TMP17:%.*]] = load i8*, i8** [[TMP16]], align 8
+// CHECK1-NEXT: [[TMP18:%.*]] = bitcast i8* [[TMP17]] to i16*
+// CHECK1-NEXT: [[TMP19:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]]
+// CHECK1-NEXT: [[TMP20:%.*]] = bitcast i32 addrspace(3)* [[TMP19]] to i16 addrspace(3)*
+// CHECK1-NEXT: [[TMP21:%.*]] = load i16, i16* [[TMP18]], align 2
+// CHECK1-NEXT: store volatile i16 [[TMP21]], i16 addrspace(3)* [[TMP20]], align 2
+// CHECK1-NEXT: br label [[IFCONT10:%.*]]
+// CHECK1: else9:
+// CHECK1-NEXT: br label [[IFCONT10]]
+// CHECK1: ifcont10:
+// CHECK1-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]])
+// CHECK1-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTADDR1]], align 4
+// CHECK1-NEXT: [[IS_ACTIVE_THREAD11:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP22]]
+// CHECK1-NEXT: br i1 [[IS_ACTIVE_THREAD11]], label [[THEN12:%.*]], label [[ELSE13:%.*]]
+// CHECK1: then12:
+// CHECK1-NEXT: [[TMP23:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]]
+// CHECK1-NEXT: [[TMP24:%.*]] = bitcast i32 addrspace(3)* [[TMP23]] to i16 addrspace(3)*
+// CHECK1-NEXT: [[TMP25:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i64 0, i64 1
+// CHECK1-NEXT: [[TMP26:%.*]] = load i8*, i8** [[TMP25]], align 8
+// CHECK1-NEXT: [[TMP27:%.*]] = bitcast i8* [[TMP26]] to i16*
+// CHECK1-NEXT: [[TMP28:%.*]] = load volatile i16, i16 addrspace(3)* [[TMP24]], align 2
+// CHECK1-NEXT: store i16 [[TMP28]], i16* [[TMP27]], align 2
+// CHECK1-NEXT: br label [[IFCONT14:%.*]]
+// CHECK1: else13:
+// CHECK1-NEXT: br label [[IFCONT14]]
+// CHECK1: ifcont14:
+// CHECK1-NEXT: ret void
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func17
+// CHECK1-SAME: (i8* [[TMP0:%.*]], i16 signext [[TMP1:%.*]], i16 signext [[TMP2:%.*]], i16 signext [[TMP3:%.*]]) #[[ATTR0]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 8
+// CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca i16, align 2
+// CHECK1-NEXT: [[DOTADDR2:%.*]] = alloca i16, align 2
+// CHECK1-NEXT: [[DOTADDR3:%.*]] = alloca i16, align 2
+// CHECK1-NEXT: [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST:%.*]] = alloca [2 x i8*], align 8
+// CHECK1-NEXT: [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_REDUCTION_ELEMENT4:%.*]] = alloca i16, align 2
+// CHECK1-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 8
+// CHECK1-NEXT: store i16 [[TMP1]], i16* [[DOTADDR1]], align 2
+// CHECK1-NEXT: store i16 [[TMP2]], i16* [[DOTADDR2]], align 2
+// CHECK1-NEXT: store i16 [[TMP3]], i16* [[DOTADDR3]], align 2
+// CHECK1-NEXT: [[TMP4:%.*]] = load i8*, i8** [[DOTADDR]], align 8
+// CHECK1-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to [2 x i8*]*
+// CHECK1-NEXT: [[TMP6:%.*]] = load i16, i16* [[DOTADDR1]], align 2
+// CHECK1-NEXT: [[TMP7:%.*]] = load i16, i16* [[DOTADDR2]], align 2
+// CHECK1-NEXT: [[TMP8:%.*]] = load i16, i16* [[DOTADDR3]], align 2
+// CHECK1-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i64 0, i64 0
+// CHECK1-NEXT: [[TMP10:%.*]] = load i8*, i8** [[TMP9]], align 8
+// CHECK1-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i64 0, i64 0
+// CHECK1-NEXT: [[TMP12:%.*]] = bitcast i8* [[TMP10]] to i32*
+// CHECK1-NEXT: [[TMP13:%.*]] = getelementptr i32, i32* [[TMP12]], i64 1
+// CHECK1-NEXT: [[TMP14:%.*]] = bitcast i32* [[TMP13]] to i8*
+// CHECK1-NEXT: [[TMP15:%.*]] = load i32, i32* [[TMP12]], align 4
+// CHECK1-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+// CHECK1-NEXT: [[TMP16:%.*]] = trunc i32 [[NVPTX_WARP_SIZE]] to i16
+// CHECK1-NEXT: [[TMP17:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP15]], i16 [[TMP7]], i16 [[TMP16]])
+// CHECK1-NEXT: store i32 [[TMP17]], i32* [[DOTOMP_REDUCTION_ELEMENT]], align 4
+// CHECK1-NEXT: [[TMP18:%.*]] = getelementptr i32, i32* [[TMP12]], i64 1
+// CHECK1-NEXT: [[TMP19:%.*]] = getelementptr i32, i32* [[DOTOMP_REDUCTION_ELEMENT]], i64 1
+// CHECK1-NEXT: [[TMP20:%.*]] = bitcast i32* [[DOTOMP_REDUCTION_ELEMENT]] to i8*
+// CHECK1-NEXT: store i8* [[TMP20]], i8** [[TMP11]], align 8
+// CHECK1-NEXT: [[TMP21:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i64 0, i64 1
+// CHECK1-NEXT: [[TMP22:%.*]] = load i8*, i8** [[TMP21]], align 8
+// CHECK1-NEXT: [[TMP23:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i64 0, i64 1
+// CHECK1-NEXT: [[TMP24:%.*]] = bitcast i8* [[TMP22]] to i16*
+// CHECK1-NEXT: [[TMP25:%.*]] = getelementptr i16, i16* [[TMP24]], i64 1
+// CHECK1-NEXT: [[TMP26:%.*]] = bitcast i16* [[TMP25]] to i8*
+// CHECK1-NEXT: [[TMP27:%.*]] = load i16, i16* [[TMP24]], align 2
+// CHECK1-NEXT: [[TMP28:%.*]] = sext i16 [[TMP27]] to i32
+// CHECK1-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+// CHECK1-NEXT: [[TMP29:%.*]] = trunc i32 [[NVPTX_WARP_SIZE5]] to i16
+// CHECK1-NEXT: [[TMP30:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP28]], i16 [[TMP7]], i16 [[TMP29]])
+// CHECK1-NEXT: [[TMP31:%.*]] = trunc i32 [[TMP30]] to i16
+// CHECK1-NEXT: store i16 [[TMP31]], i16* [[DOTOMP_REDUCTION_ELEMENT4]], align 2
+// CHECK1-NEXT: [[TMP32:%.*]] = getelementptr i16, i16* [[TMP24]], i64 1
+// CHECK1-NEXT: [[TMP33:%.*]] = getelementptr i16, i16* [[DOTOMP_REDUCTION_ELEMENT4]], i64 1
+// CHECK1-NEXT: [[TMP34:%.*]] = bitcast i16* [[DOTOMP_REDUCTION_ELEMENT4]] to i8*
+// CHECK1-NEXT: store i8* [[TMP34]], i8** [[TMP23]], align 8
+// CHECK1-NEXT: [[TMP35:%.*]] = icmp eq i16 [[TMP8]], 0
+// CHECK1-NEXT: [[TMP36:%.*]] = icmp eq i16 [[TMP8]], 1
+// CHECK1-NEXT: [[TMP37:%.*]] = icmp ult i16 [[TMP6]], [[TMP7]]
+// CHECK1-NEXT: [[TMP38:%.*]] = and i1 [[TMP36]], [[TMP37]]
+// CHECK1-NEXT: [[TMP39:%.*]] = icmp eq i16 [[TMP8]], 2
+// CHECK1-NEXT: [[TMP40:%.*]] = and i16 [[TMP6]], 1
+// CHECK1-NEXT: [[TMP41:%.*]] = icmp eq i16 [[TMP40]], 0
+// CHECK1-NEXT: [[TMP42:%.*]] = and i1 [[TMP39]], [[TMP41]]
+// CHECK1-NEXT: [[TMP43:%.*]] = icmp sgt i16 [[TMP7]], 0
+// CHECK1-NEXT: [[TMP44:%.*]] = and i1 [[TMP42]], [[TMP43]]
+// CHECK1-NEXT: [[TMP45:%.*]] = or i1 [[TMP35]], [[TMP38]]
+// CHECK1-NEXT: [[TMP46:%.*]] = or i1 [[TMP45]], [[TMP44]]
+// CHECK1-NEXT: br i1 [[TMP46]], label [[THEN:%.*]], label [[ELSE:%.*]]
+// CHECK1: then:
+// CHECK1-NEXT: [[TMP47:%.*]] = bitcast [2 x i8*]* [[TMP5]] to i8*
+// CHECK1-NEXT: [[TMP48:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to i8*
+// CHECK1-NEXT: call void @"_omp$reduction$reduction_func16"(i8* [[TMP47]], i8* [[TMP48]]) #[[ATTR3]]
+// CHECK1-NEXT: br label [[IFCONT:%.*]]
+// CHECK1: else:
+// CHECK1-NEXT: br label [[IFCONT]]
+// CHECK1: ifcont:
+// CHECK1-NEXT: [[TMP49:%.*]] = icmp eq i16 [[TMP8]], 1
+// CHECK1-NEXT: [[TMP50:%.*]] = icmp uge i16 [[TMP6]], [[TMP7]]
+// CHECK1-NEXT: [[TMP51:%.*]] = and i1 [[TMP49]], [[TMP50]]
+// CHECK1-NEXT: br i1 [[TMP51]], label [[THEN6:%.*]], label [[ELSE7:%.*]]
+// CHECK1: then6:
+// CHECK1-NEXT: [[TMP52:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i64 0, i64 0
+// CHECK1-NEXT: [[TMP53:%.*]] = load i8*, i8** [[TMP52]], align 8
+// CHECK1-NEXT: [[TMP54:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i64 0, i64 0
+// CHECK1-NEXT: [[TMP55:%.*]] = load i8*, i8** [[TMP54]], align 8
+// CHECK1-NEXT: [[TMP56:%.*]] = bitcast i8* [[TMP53]] to i32*
+// CHECK1-NEXT: [[TMP57:%.*]] = bitcast i8* [[TMP55]] to i32*
+// CHECK1-NEXT: [[TMP58:%.*]] = load i32, i32* [[TMP56]], align 4
+// CHECK1-NEXT: store i32 [[TMP58]], i32* [[TMP57]], align 4
+// CHECK1-NEXT: [[TMP59:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i64 0, i64 1
+// CHECK1-NEXT: [[TMP60:%.*]] = load i8*, i8** [[TMP59]], align 8
+// CHECK1-NEXT: [[TMP61:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i64 0, i64 1
+// CHECK1-NEXT: [[TMP62:%.*]] = load i8*, i8** [[TMP61]], align 8
+// CHECK1-NEXT: [[TMP63:%.*]] = bitcast i8* [[TMP60]] to i16*
+// CHECK1-NEXT: [[TMP64:%.*]] = bitcast i8* [[TMP62]] to i16*
+// CHECK1-NEXT: [[TMP65:%.*]] = load i16, i16* [[TMP63]], align 2
+// CHECK1-NEXT: store i16 [[TMP65]], i16* [[TMP64]], align 2
+// CHECK1-NEXT: br label [[IFCONT8:%.*]]
+// CHECK1: else7:
+// CHECK1-NEXT: br label [[IFCONT8]]
+// CHECK1: ifcont8:
+// CHECK1-NEXT: ret void
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@_omp_reduction_inter_warp_copy_func18
+// CHECK1-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 8
+// CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]])
+// CHECK1-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 8
+// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
+// CHECK1-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+// CHECK1-NEXT: [[NVPTX_TID2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+// CHECK1-NEXT: [[NVPTX_LANE_ID:%.*]] = and i32 [[NVPTX_TID2]], 31
+// CHECK1-NEXT: [[NVPTX_TID3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+// CHECK1-NEXT: [[NVPTX_WARP_ID:%.*]] = ashr i32 [[NVPTX_TID3]], 5
+// CHECK1-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 8
+// CHECK1-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]*
+// CHECK1-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]])
+// CHECK1-NEXT: [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0
+// CHECK1-NEXT: br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]]
+// CHECK1: then:
+// CHECK1-NEXT: [[TMP5:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i64 0, i64 0
+// CHECK1-NEXT: [[TMP6:%.*]] = load i8*, i8** [[TMP5]], align 8
+// CHECK1-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to i32*
+// CHECK1-NEXT: [[TMP8:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]]
+// CHECK1-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP7]], align 4
+// CHECK1-NEXT: store volatile i32 [[TMP9]], i32 addrspace(3)* [[TMP8]], align 4
+// CHECK1-NEXT: br label [[IFCONT:%.*]]
+// CHECK1: else:
+// CHECK1-NEXT: br label [[IFCONT]]
+// CHECK1: ifcont:
+// CHECK1-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]])
+// CHECK1-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTADDR1]], align 4
+// CHECK1-NEXT: [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP10]]
+// CHECK1-NEXT: br i1 [[IS_ACTIVE_THREAD]], label [[THEN4:%.*]], label [[ELSE5:%.*]]
+// CHECK1: then4:
+// CHECK1-NEXT: [[TMP11:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]]
+// CHECK1-NEXT: [[TMP12:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i64 0, i64 0
+// CHECK1-NEXT: [[TMP13:%.*]] = load i8*, i8** [[TMP12]], align 8
+// CHECK1-NEXT: [[TMP14:%.*]] = bitcast i8* [[TMP13]] to i32*
+// CHECK1-NEXT: [[TMP15:%.*]] = load volatile i32, i32 addrspace(3)* [[TMP11]], align 4
+// CHECK1-NEXT: store i32 [[TMP15]], i32* [[TMP14]], align 4
+// CHECK1-NEXT: br label [[IFCONT6:%.*]]
+// CHECK1: else5:
+// CHECK1-NEXT: br label [[IFCONT6]]
+// CHECK1: ifcont6:
+// CHECK1-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]])
+// CHECK1-NEXT: [[WARP_MASTER7:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0
+// CHECK1-NEXT: br i1 [[WARP_MASTER7]], label [[THEN8:%.*]], label [[ELSE9:%.*]]
+// CHECK1: then8:
+// CHECK1-NEXT: [[TMP16:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i64 0, i64 1
+// CHECK1-NEXT: [[TMP17:%.*]] = load i8*, i8** [[TMP16]], align 8
+// CHECK1-NEXT: [[TMP18:%.*]] = bitcast i8* [[TMP17]] to i16*
+// CHECK1-NEXT: [[TMP19:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]]
+// CHECK1-NEXT: [[TMP20:%.*]] = bitcast i32 addrspace(3)* [[TMP19]] to i16 addrspace(3)*
+// CHECK1-NEXT: [[TMP21:%.*]] = load i16, i16* [[TMP18]], align 2
+// CHECK1-NEXT: store volatile i16 [[TMP21]], i16 addrspace(3)* [[TMP20]], align 2
+// CHECK1-NEXT: br label [[IFCONT10:%.*]]
+// CHECK1: else9:
+// CHECK1-NEXT: br label [[IFCONT10]]
+// CHECK1: ifcont10:
+// CHECK1-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]])
+// CHECK1-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTADDR1]], align 4
+// CHECK1-NEXT: [[IS_ACTIVE_THREAD11:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP22]]
+// CHECK1-NEXT: br i1 [[IS_ACTIVE_THREAD11]], label [[THEN12:%.*]], label [[ELSE13:%.*]]
+// CHECK1: then12:
+// CHECK1-NEXT: [[TMP23:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]]
+// CHECK1-NEXT: [[TMP24:%.*]] = bitcast i32 addrspace(3)* [[TMP23]] to i16 addrspace(3)*
+// CHECK1-NEXT: [[TMP25:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i64 0, i64 1
+// CHECK1-NEXT: [[TMP26:%.*]] = load i8*, i8** [[TMP25]], align 8
+// CHECK1-NEXT: [[TMP27:%.*]] = bitcast i8* [[TMP26]] to i16*
+// CHECK1-NEXT: [[TMP28:%.*]] = load volatile i16, i16 addrspace(3)* [[TMP24]], align 2
+// CHECK1-NEXT: store i16 [[TMP28]], i16* [[TMP27]], align 2
+// CHECK1-NEXT: br label [[IFCONT14:%.*]]
+// CHECK1: else13:
+// CHECK1-NEXT: br label [[IFCONT14]]
+// CHECK1: ifcont14:
+// CHECK1-NEXT: ret void
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_copy_func19
+// CHECK1-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 8
+// CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 8
+// CHECK1-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 8
+// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
+// CHECK1-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 8
+// CHECK1-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 8
+// CHECK1-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]*
+// CHECK1-NEXT: [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 8
+// CHECK1-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.5*
+// CHECK1-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4
+// CHECK1-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i64 0, i64 0
+// CHECK1-NEXT: [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 8
+// CHECK1-NEXT: [[TMP10:%.*]] = bitcast i8* [[TMP9]] to i32*
+// CHECK1-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_5:%.*]], %struct._globalized_locals_ty.5* [[TMP6]], i32 0, i32 0
+// CHECK1-NEXT: [[TMP11:%.*]] = getelementptr inbounds [1024 x i32], [1024 x i32]* [[A]], i32 0, i32 [[TMP7]]
+// CHECK1-NEXT: [[TMP12:%.*]] = load i32, i32* [[TMP10]], align 4
+// CHECK1-NEXT: store i32 [[TMP12]], i32* [[TMP11]], align 128
+// CHECK1-NEXT: [[TMP13:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i64 0, i64 1
+// CHECK1-NEXT: [[TMP14:%.*]] = load i8*, i8** [[TMP13]], align 8
+// CHECK1-NEXT: [[TMP15:%.*]] = bitcast i8* [[TMP14]] to i16*
+// CHECK1-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_5]], %struct._globalized_locals_ty.5* [[TMP6]], i32 0, i32 1
+// CHECK1-NEXT: [[TMP16:%.*]] = getelementptr inbounds [1024 x i16], [1024 x i16]* [[B]], i32 0, i32 [[TMP7]]
+// CHECK1-NEXT: [[TMP17:%.*]] = load i16, i16* [[TMP15]], align 2
+// CHECK1-NEXT: store i16 [[TMP17]], i16* [[TMP16]], align 128
+// CHECK1-NEXT: ret void
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_reduce_func20
+// CHECK1-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 8
+// CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 8
+// CHECK1-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 8
+// CHECK1-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 8
+// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
+// CHECK1-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 8
+// CHECK1-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 8
+// CHECK1-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.5*
+// CHECK1-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4
+// CHECK1-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 0
+// CHECK1-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_5:%.*]], %struct._globalized_locals_ty.5* [[TMP4]], i32 0, i32 0
+// CHECK1-NEXT: [[TMP7:%.*]] = getelementptr inbounds [1024 x i32], [1024 x i32]* [[A]], i32 0, i32 [[TMP5]]
+// CHECK1-NEXT: [[TMP8:%.*]] = bitcast i32* [[TMP7]] to i8*
+// CHECK1-NEXT: store i8* [[TMP8]], i8** [[TMP6]], align 8
+// CHECK1-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 1
+// CHECK1-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_5]], %struct._globalized_locals_ty.5* [[TMP4]], i32 0, i32 1
+// CHECK1-NEXT: [[TMP10:%.*]] = getelementptr inbounds [1024 x i16], [1024 x i16]* [[B]], i32 0, i32 [[TMP5]]
+// CHECK1-NEXT: [[TMP11:%.*]] = bitcast i16* [[TMP10]] to i8*
+// CHECK1-NEXT: store i8* [[TMP11]], i8** [[TMP9]], align 8
+// CHECK1-NEXT: [[TMP12:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
+// CHECK1-NEXT: [[TMP13:%.*]] = load i8*, i8** [[DOTADDR2]], align 8
+// CHECK1-NEXT: call void @"_omp$reduction$reduction_func16"(i8* [[TMP12]], i8* [[TMP13]]) #[[ATTR3]]
+// CHECK1-NEXT: ret void
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_copy_func21
+// CHECK1-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 8
+// CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 8
+// CHECK1-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 8
+// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
+// CHECK1-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 8
+// CHECK1-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 8
+// CHECK1-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]*
+// CHECK1-NEXT: [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 8
+// CHECK1-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.5*
+// CHECK1-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4
+// CHECK1-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i64 0, i64 0
+// CHECK1-NEXT: [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 8
+// CHECK1-NEXT: [[TMP10:%.*]] = bitcast i8* [[TMP9]] to i32*
+// CHECK1-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_5:%.*]], %struct._globalized_locals_ty.5* [[TMP6]], i32 0, i32 0
+// CHECK1-NEXT: [[TMP11:%.*]] = getelementptr inbounds [1024 x i32], [1024 x i32]* [[A]], i32 0, i32 [[TMP7]]
+// CHECK1-NEXT: [[TMP12:%.*]] = load i32, i32* [[TMP11]], align 128
+// CHECK1-NEXT: store i32 [[TMP12]], i32* [[TMP10]], align 4
+// CHECK1-NEXT: [[TMP13:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i64 0, i64 1
+// CHECK1-NEXT: [[TMP14:%.*]] = load i8*, i8** [[TMP13]], align 8
+// CHECK1-NEXT: [[TMP15:%.*]] = bitcast i8* [[TMP14]] to i16*
+// CHECK1-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_5]], %struct._globalized_locals_ty.5* [[TMP6]], i32 0, i32 1
+// CHECK1-NEXT: [[TMP16:%.*]] = getelementptr inbounds [1024 x i16], [1024 x i16]* [[B]], i32 0, i32 [[TMP7]]
+// CHECK1-NEXT: [[TMP17:%.*]] = load i16, i16* [[TMP16]], align 128
+// CHECK1-NEXT: store i16 [[TMP17]], i16* [[TMP15]], align 2
+// CHECK1-NEXT: ret void
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_reduce_func22
+// CHECK1-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 8
+// CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 8
+// CHECK1-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 8
+// CHECK1-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 8
+// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
+// CHECK1-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 8
+// CHECK1-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 8
+// CHECK1-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.5*
+// CHECK1-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4
+// CHECK1-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 0
+// CHECK1-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_5:%.*]], %struct._globalized_locals_ty.5* [[TMP4]], i32 0, i32 0
+// CHECK1-NEXT: [[TMP7:%.*]] = getelementptr inbounds [1024 x i32], [1024 x i32]* [[A]], i32 0, i32 [[TMP5]]
+// CHECK1-NEXT: [[TMP8:%.*]] = bitcast i32* [[TMP7]] to i8*
+// CHECK1-NEXT: store i8* [[TMP8]], i8** [[TMP6]], align 8
+// CHECK1-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 1
+// CHECK1-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_5]], %struct._globalized_locals_ty.5* [[TMP4]], i32 0, i32 1
+// CHECK1-NEXT: [[TMP10:%.*]] = getelementptr inbounds [1024 x i16], [1024 x i16]* [[B]], i32 0, i32 [[TMP5]]
+// CHECK1-NEXT: [[TMP11:%.*]] = bitcast i16* [[TMP10]] to i8*
+// CHECK1-NEXT: store i8* [[TMP11]], i8** [[TMP9]], align 8
+// CHECK1-NEXT: [[TMP12:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
+// CHECK1-NEXT: [[TMP13:%.*]] = load i8*, i8** [[DOTADDR2]], align 8
+// CHECK1-NEXT: call void @"_omp$reduction$reduction_func16"(i8* [[TMP13]], i8* [[TMP12]]) #[[ATTR3]]
+// CHECK1-NEXT: ret void
+//
+//
+// CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l23_worker
+// CHECK2-SAME: () #[[ATTR0:[0-9]+]] {
+// CHECK2-NEXT: entry:
+// CHECK2-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 8
+// CHECK2-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1
+// CHECK2-NEXT: store i8* null, i8** [[WORK_FN]], align 8
+// CHECK2-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1
+// CHECK2-NEXT: br label [[DOTAWAIT_WORK:%.*]]
+// CHECK2: .await.work:
+// CHECK2-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
+// CHECK2-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]])
+// CHECK2-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8
+// CHECK2-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1
+// CHECK2-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 8
+// CHECK2-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null
+// CHECK2-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]]
+// CHECK2: .select.workers:
+// CHECK2-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1
+// CHECK2-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0
+// CHECK2-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]]
+// CHECK2: .execute.parallel:
+// CHECK2-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]])
+// CHECK2-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)*
+// CHECK2-NEXT: call void [[TMP5]](i16 0, i32 [[TMP4]])
+// CHECK2-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]]
+// CHECK2: .terminate.parallel:
+// CHECK2-NEXT: call void @__kmpc_kernel_end_parallel()
+// CHECK2-NEXT: br label [[DOTBARRIER_PARALLEL]]
+// CHECK2: .barrier.parallel:
+// CHECK2-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
+// CHECK2-NEXT: br label [[DOTAWAIT_WORK]]
+// CHECK2: .exit:
+// CHECK2-NEXT: ret void
+//
+//
+// CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l23
+// CHECK2-SAME: (i64 [[E:%.*]]) #[[ATTR1:[0-9]+]] {
+// CHECK2-NEXT: entry:
+// CHECK2-NEXT: [[E_ADDR:%.*]] = alloca i64, align 8
+// CHECK2-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK2-NEXT: store i64 [[E]], i64* [[E_ADDR]], align 8
+// CHECK2-NEXT: [[CONV:%.*]] = bitcast i64* [[E_ADDR]] to double*
+// CHECK2-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+// CHECK2-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK2-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+// CHECK2-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]]
+// CHECK2-NEXT: [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]]
+// CHECK2-NEXT: br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]]
+// CHECK2: .worker:
+// CHECK2-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l23_worker() #[[ATTR3:[0-9]+]]
+// CHECK2-NEXT: br label [[DOTEXIT:%.*]]
+// CHECK2: .mastercheck:
+// CHECK2-NEXT: [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+// CHECK2-NEXT: [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK2-NEXT: [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+// CHECK2-NEXT: [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1
+// CHECK2-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1
+// CHECK2-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], -1
+// CHECK2-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP2]], [[TMP3]]
+// CHECK2-NEXT: [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]]
+// CHECK2-NEXT: br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]]
+// CHECK2: .master:
+// CHECK2-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK2-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+// CHECK2-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]]
+// CHECK2-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1)
+// CHECK2-NEXT: call void @__kmpc_data_sharing_init_stack()
+// CHECK2-NEXT: [[TMP5:%.*]] = call i8* @__kmpc_data_sharing_push_stack(i64 8, i16 1)
+// CHECK2-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty*
+// CHECK2-NEXT: [[TMP7:%.*]] = load double, double* [[CONV]], align 8
+// CHECK2-NEXT: [[E7:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP6]], i32 0, i32 0
+// CHECK2-NEXT: store double [[TMP7]], double* [[E7]], align 8
+// CHECK2-NEXT: [[TMP8:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK2-NEXT: store i32 [[TMP8]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK2-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], double* [[E7]]) #[[ATTR3]]
+// CHECK2-NEXT: call void @__kmpc_data_sharing_pop_stack(i8* [[TMP5]])
+// CHECK2-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]]
+// CHECK2: .termination.notifier:
+// CHECK2-NEXT: call void @__kmpc_kernel_deinit(i16 1)
+// CHECK2-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
+// CHECK2-NEXT: br label [[DOTEXIT]]
+// CHECK2: .exit:
+// CHECK2-NEXT: ret void
+//
+//
+// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__
+// CHECK2-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], double* nonnull align 8 dereferenceable(8) [[E:%.*]]) #[[ATTR1]] {
+// CHECK2-NEXT: entry:
+// CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK2-NEXT: [[E_ADDR:%.*]] = alloca double*, align 8
+// CHECK2-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x i8*], align 8
+// CHECK2-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK2-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
+// CHECK2-NEXT: store double* [[E]], double** [[E_ADDR]], align 8
+// CHECK2-NEXT: [[TMP0:%.*]] = load double*, double** [[E_ADDR]], align 8
+// CHECK2-NEXT: [[TMP1:%.*]] = call i8* @__kmpc_data_sharing_push_stack(i64 8, i16 1)
+// CHECK2-NEXT: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to %struct._globalized_locals_ty.0*
+// CHECK2-NEXT: [[E1:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0:%.*]], %struct._globalized_locals_ty.0* [[TMP2]], i32 0, i32 0
+// CHECK2-NEXT: store double 0.000000e+00, double* [[E1]], align 8
+// CHECK2-NEXT: [[TMP3:%.*]] = load double, double* [[E1]], align 8
+// CHECK2-NEXT: [[ADD:%.*]] = fadd double [[TMP3]], 5.000000e+00
+// CHECK2-NEXT: store double [[ADD]], double* [[E1]], align 8
+// CHECK2-NEXT: [[TMP4:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK2-NEXT: [[TMP5:%.*]] = load i32, i32* [[TMP4]], align 4
+// CHECK2-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 0
+// CHECK2-NEXT: [[TMP7:%.*]] = bitcast double* [[E1]] to i8*
+// CHECK2-NEXT: store i8* [[TMP7]], i8** [[TMP6]], align 8
+// CHECK2-NEXT: [[TMP8:%.*]] = bitcast [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
+// CHECK2-NEXT: [[TMP9:%.*]] = load i8*, i8** @"_openmp_teams_reductions_buffer_$_$ptr", align 8
+// CHECK2-NEXT: [[TMP10:%.*]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i8* [[TMP9]], i32 1024, i8* [[TMP8]], void (i8*, i16, i16, i16)* @_omp_reduction_shuffle_and_reduce_func, void (i8*, i32)* @_omp_reduction_inter_warp_copy_func, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_copy_func, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_reduce_func, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_copy_func, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_reduce_func)
+// CHECK2-NEXT: [[TMP11:%.*]] = icmp eq i32 [[TMP10]], 1
+// CHECK2-NEXT: br i1 [[TMP11]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]]
+// CHECK2: .omp.reduction.then:
+// CHECK2-NEXT: [[TMP12:%.*]] = load double, double* [[TMP0]], align 8
+// CHECK2-NEXT: [[TMP13:%.*]] = load double, double* [[E1]], align 8
+// CHECK2-NEXT: [[ADD2:%.*]] = fadd double [[TMP12]], [[TMP13]]
+// CHECK2-NEXT: store double [[ADD2]], double* [[TMP0]], align 8
+// CHECK2-NEXT: call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP5]])
+// CHECK2-NEXT: br label [[DOTOMP_REDUCTION_DONE]]
+// CHECK2: .omp.reduction.done:
+// CHECK2-NEXT: call void @__kmpc_data_sharing_pop_stack(i8* [[TMP1]])
+// CHECK2-NEXT: ret void
+//
+//
+// CHECK2-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func
+// CHECK2-SAME: (i8* [[TMP0:%.*]], i16 signext [[TMP1:%.*]], i16 signext [[TMP2:%.*]], i16 signext [[TMP3:%.*]]) #[[ATTR0]] {
+// CHECK2-NEXT: entry:
+// CHECK2-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 8
+// CHECK2-NEXT: [[DOTADDR1:%.*]] = alloca i16, align 2
+// CHECK2-NEXT: [[DOTADDR2:%.*]] = alloca i16, align 2
+// CHECK2-NEXT: [[DOTADDR3:%.*]] = alloca i16, align 2
+// CHECK2-NEXT: [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST:%.*]] = alloca [1 x i8*], align 8
+// CHECK2-NEXT: [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca double, align 8
+// CHECK2-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 8
+// CHECK2-NEXT: store i16 [[TMP1]], i16* [[DOTADDR1]], align 2
+// CHECK2-NEXT: store i16 [[TMP2]], i16* [[DOTADDR2]], align 2
+// CHECK2-NEXT: store i16 [[TMP3]], i16* [[DOTADDR3]], align 2
+// CHECK2-NEXT: [[TMP4:%.*]] = load i8*, i8** [[DOTADDR]], align 8
+// CHECK2-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to [1 x i8*]*
+// CHECK2-NEXT: [[TMP6:%.*]] = load i16, i16* [[DOTADDR1]], align 2
+// CHECK2-NEXT: [[TMP7:%.*]] = load i16, i16* [[DOTADDR2]], align 2
+// CHECK2-NEXT: [[TMP8:%.*]] = load i16, i16* [[DOTADDR3]], align 2
+// CHECK2-NEXT: [[TMP9:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP5]], i64 0, i64 0
+// CHECK2-NEXT: [[TMP10:%.*]] = load i8*, i8** [[TMP9]], align 8
+// CHECK2-NEXT: [[TMP11:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i64 0, i64 0
+// CHECK2-NEXT: [[TMP12:%.*]] = bitcast i8* [[TMP10]] to double*
+// CHECK2-NEXT: [[TMP13:%.*]] = getelementptr double, double* [[TMP12]], i64 1
+// CHECK2-NEXT: [[TMP14:%.*]] = bitcast double* [[TMP13]] to i8*
+// CHECK2-NEXT: [[TMP15:%.*]] = bitcast double* [[TMP12]] to i64*
+// CHECK2-NEXT: [[TMP16:%.*]] = bitcast double* [[DOTOMP_REDUCTION_ELEMENT]] to i64*
+// CHECK2-NEXT: [[TMP17:%.*]] = load i64, i64* [[TMP15]], align 8
+// CHECK2-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+// CHECK2-NEXT: [[TMP18:%.*]] = trunc i32 [[NVPTX_WARP_SIZE]] to i16
+// CHECK2-NEXT: [[TMP19:%.*]] = call i64 @__kmpc_shuffle_int64(i64 [[TMP17]], i16 [[TMP7]], i16 [[TMP18]])
+// CHECK2-NEXT: store i64 [[TMP19]], i64* [[TMP16]], align 8
+// CHECK2-NEXT: [[TMP20:%.*]] = getelementptr i64, i64* [[TMP15]], i64 1
+// CHECK2-NEXT: [[TMP21:%.*]] = getelementptr i64, i64* [[TMP16]], i64 1
+// CHECK2-NEXT: [[TMP22:%.*]] = bitcast double* [[DOTOMP_REDUCTION_ELEMENT]] to i8*
+// CHECK2-NEXT: store i8* [[TMP22]], i8** [[TMP11]], align 8
+// CHECK2-NEXT: [[TMP23:%.*]] = icmp eq i16 [[TMP8]], 0
+// CHECK2-NEXT: [[TMP24:%.*]] = icmp eq i16 [[TMP8]], 1
+// CHECK2-NEXT: [[TMP25:%.*]] = icmp ult i16 [[TMP6]], [[TMP7]]
+// CHECK2-NEXT: [[TMP26:%.*]] = and i1 [[TMP24]], [[TMP25]]
+// CHECK2-NEXT: [[TMP27:%.*]] = icmp eq i16 [[TMP8]], 2
+// CHECK2-NEXT: [[TMP28:%.*]] = and i16 [[TMP6]], 1
+// CHECK2-NEXT: [[TMP29:%.*]] = icmp eq i16 [[TMP28]], 0
+// CHECK2-NEXT: [[TMP30:%.*]] = and i1 [[TMP27]], [[TMP29]]
+// CHECK2-NEXT: [[TMP31:%.*]] = icmp sgt i16 [[TMP7]], 0
+// CHECK2-NEXT: [[TMP32:%.*]] = and i1 [[TMP30]], [[TMP31]]
+// CHECK2-NEXT: [[TMP33:%.*]] = or i1 [[TMP23]], [[TMP26]]
+// CHECK2-NEXT: [[TMP34:%.*]] = or i1 [[TMP33]], [[TMP32]]
+// CHECK2-NEXT: br i1 [[TMP34]], label [[THEN:%.*]], label [[ELSE:%.*]]
+// CHECK2: then:
+// CHECK2-NEXT: [[TMP35:%.*]] = bitcast [1 x i8*]* [[TMP5]] to i8*
+// CHECK2-NEXT: [[TMP36:%.*]] = bitcast [1 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to i8*
+// CHECK2-NEXT: call void @"_omp$reduction$reduction_func"(i8* [[TMP35]], i8* [[TMP36]]) #[[ATTR3]]
+// CHECK2-NEXT: br label [[IFCONT:%.*]]
+// CHECK2: else:
+// CHECK2-NEXT: br label [[IFCONT]]
+// CHECK2: ifcont:
+// CHECK2-NEXT: [[TMP37:%.*]] = icmp eq i16 [[TMP8]], 1
+// CHECK2-NEXT: [[TMP38:%.*]] = icmp uge i16 [[TMP6]], [[TMP7]]
+// CHECK2-NEXT: [[TMP39:%.*]] = and i1 [[TMP37]], [[TMP38]]
+// CHECK2-NEXT: br i1 [[TMP39]], label [[THEN4:%.*]], label [[ELSE5:%.*]]
+// CHECK2: then4:
+// CHECK2-NEXT: [[TMP40:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i64 0, i64 0
+// CHECK2-NEXT: [[TMP41:%.*]] = load i8*, i8** [[TMP40]], align 8
+// CHECK2-NEXT: [[TMP42:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP5]], i64 0, i64 0
+// CHECK2-NEXT: [[TMP43:%.*]] = load i8*, i8** [[TMP42]], align 8
+// CHECK2-NEXT: [[TMP44:%.*]] = bitcast i8* [[TMP41]] to double*
+// CHECK2-NEXT: [[TMP45:%.*]] = bitcast i8* [[TMP43]] to double*
+// CHECK2-NEXT: [[TMP46:%.*]] = load double, double* [[TMP44]], align 8
+// CHECK2-NEXT: store double [[TMP46]], double* [[TMP45]], align 8
+// CHECK2-NEXT: br label [[IFCONT6:%.*]]
+// CHECK2: else5:
+// CHECK2-NEXT: br label [[IFCONT6]]
+// CHECK2: ifcont6:
+// CHECK2-NEXT: ret void
+//
+//
+// CHECK2-LABEL: define {{[^@]+}}@_omp_reduction_inter_warp_copy_func
+// CHECK2-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] {
+// CHECK2-NEXT: entry:
+// CHECK2-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 8
+// CHECK2-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTCNT_ADDR:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK2-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 8
+// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
+// CHECK2-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+// CHECK2-NEXT: [[NVPTX_TID2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+// CHECK2-NEXT: [[NVPTX_LANE_ID:%.*]] = and i32 [[NVPTX_TID2]], 31
+// CHECK2-NEXT: [[NVPTX_TID3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+// CHECK2-NEXT: [[NVPTX_WARP_ID:%.*]] = ashr i32 [[NVPTX_TID3]], 5
+// CHECK2-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 8
+// CHECK2-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [1 x i8*]*
+// CHECK2-NEXT: store i32 0, i32* [[DOTCNT_ADDR]], align 4
+// CHECK2-NEXT: br label [[PRECOND:%.*]]
+// CHECK2: precond:
+// CHECK2-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTCNT_ADDR]], align 4
+// CHECK2-NEXT: [[TMP6:%.*]] = icmp ult i32 [[TMP5]], 2
+// CHECK2-NEXT: br i1 [[TMP6]], label [[BODY:%.*]], label [[EXIT:%.*]]
+// CHECK2: body:
+// CHECK2-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP2]])
+// CHECK2-NEXT: [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0
+// CHECK2-NEXT: br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]]
+// CHECK2: then:
+// CHECK2-NEXT: [[TMP7:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP4]], i64 0, i64 0
+// CHECK2-NEXT: [[TMP8:%.*]] = load i8*, i8** [[TMP7]], align 8
+// CHECK2-NEXT: [[TMP9:%.*]] = bitcast i8* [[TMP8]] to i32*
+// CHECK2-NEXT: [[TMP10:%.*]] = getelementptr i32, i32* [[TMP9]], i32 [[TMP5]]
+// CHECK2-NEXT: [[TMP11:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]]
+// CHECK2-NEXT: [[TMP12:%.*]] = load i32, i32* [[TMP10]], align 4
+// CHECK2-NEXT: store volatile i32 [[TMP12]], i32 addrspace(3)* [[TMP11]], align 4
+// CHECK2-NEXT: br label [[IFCONT:%.*]]
+// CHECK2: else:
+// CHECK2-NEXT: br label [[IFCONT]]
+// CHECK2: ifcont:
+// CHECK2-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]])
+// CHECK2-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTADDR1]], align 4
+// CHECK2-NEXT: [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP13]]
+// CHECK2-NEXT: br i1 [[IS_ACTIVE_THREAD]], label [[THEN4:%.*]], label [[ELSE5:%.*]]
+// CHECK2: then4:
+// CHECK2-NEXT: [[TMP14:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]]
+// CHECK2-NEXT: [[TMP15:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP4]], i64 0, i64 0
+// CHECK2-NEXT: [[TMP16:%.*]] = load i8*, i8** [[TMP15]], align 8
+// CHECK2-NEXT: [[TMP17:%.*]] = bitcast i8* [[TMP16]] to i32*
+// CHECK2-NEXT: [[TMP18:%.*]] = getelementptr i32, i32* [[TMP17]], i32 [[TMP5]]
+// CHECK2-NEXT: [[TMP19:%.*]] = load volatile i32, i32 addrspace(3)* [[TMP14]], align 4
+// CHECK2-NEXT: store i32 [[TMP19]], i32* [[TMP18]], align 4
+// CHECK2-NEXT: br label [[IFCONT6:%.*]]
+// CHECK2: else5:
+// CHECK2-NEXT: br label [[IFCONT6]]
+// CHECK2: ifcont6:
+// CHECK2-NEXT: [[TMP20:%.*]] = add nsw i32 [[TMP5]], 1
+// CHECK2-NEXT: store i32 [[TMP20]], i32* [[DOTCNT_ADDR]], align 4
+// CHECK2-NEXT: br label [[PRECOND]]
+// CHECK2: exit:
+// CHECK2-NEXT: ret void
+//
+//
+// CHECK2-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_copy_func
+// CHECK2-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] {
+// CHECK2-NEXT: entry:
+// CHECK2-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 8
+// CHECK2-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 8
+// CHECK2-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 8
+// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
+// CHECK2-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 8
+// CHECK2-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 8
+// CHECK2-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [1 x i8*]*
+// CHECK2-NEXT: [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 8
+// CHECK2-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.1*
+// CHECK2-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4
+// CHECK2-NEXT: [[TMP8:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP4]], i64 0, i64 0
+// CHECK2-NEXT: [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 8
+// CHECK2-NEXT: [[TMP10:%.*]] = bitcast i8* [[TMP9]] to double*
+// CHECK2-NEXT: [[E:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1:%.*]], %struct._globalized_locals_ty.1* [[TMP6]], i32 0, i32 0
+// CHECK2-NEXT: [[TMP11:%.*]] = getelementptr inbounds [1024 x double], [1024 x double]* [[E]], i32 0, i32 [[TMP7]]
+// CHECK2-NEXT: [[TMP12:%.*]] = load double, double* [[TMP10]], align 8
+// CHECK2-NEXT: store double [[TMP12]], double* [[TMP11]], align 128
+// CHECK2-NEXT: ret void
+//
+//
+// CHECK2-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_reduce_func
+// CHECK2-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] {
+// CHECK2-NEXT: entry:
+// CHECK2-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 8
+// CHECK2-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 8
+// CHECK2-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x i8*], align 8
+// CHECK2-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 8
+// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
+// CHECK2-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 8
+// CHECK2-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 8
+// CHECK2-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.1*
+// CHECK2-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4
+// CHECK2-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 0
+// CHECK2-NEXT: [[E:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1:%.*]], %struct._globalized_locals_ty.1* [[TMP4]], i32 0, i32 0
+// CHECK2-NEXT: [[TMP7:%.*]] = getelementptr inbounds [1024 x double], [1024 x double]* [[E]], i32 0, i32 [[TMP5]]
+// CHECK2-NEXT: [[TMP8:%.*]] = bitcast double* [[TMP7]] to i8*
+// CHECK2-NEXT: store i8* [[TMP8]], i8** [[TMP6]], align 8
+// CHECK2-NEXT: [[TMP9:%.*]] = bitcast [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
+// CHECK2-NEXT: [[TMP10:%.*]] = load i8*, i8** [[DOTADDR2]], align 8
+// CHECK2-NEXT: call void @"_omp$reduction$reduction_func"(i8* [[TMP9]], i8* [[TMP10]]) #[[ATTR3]]
+// CHECK2-NEXT: ret void
+//
+//
+// CHECK2-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_copy_func
+// CHECK2-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] {
+// CHECK2-NEXT: entry:
+// CHECK2-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 8
+// CHECK2-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 8
+// CHECK2-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 8
+// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
+// CHECK2-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 8
+// CHECK2-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 8
+// CHECK2-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [1 x i8*]*
+// CHECK2-NEXT: [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 8
+// CHECK2-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.1*
+// CHECK2-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4
+// CHECK2-NEXT: [[TMP8:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP4]], i64 0, i64 0
+// CHECK2-NEXT: [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 8
+// CHECK2-NEXT: [[TMP10:%.*]] = bitcast i8* [[TMP9]] to double*
+// CHECK2-NEXT: [[E:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1:%.*]], %struct._globalized_locals_ty.1* [[TMP6]], i32 0, i32 0
+// CHECK2-NEXT: [[TMP11:%.*]] = getelementptr inbounds [1024 x double], [1024 x double]* [[E]], i32 0, i32 [[TMP7]]
+// CHECK2-NEXT: [[TMP12:%.*]] = load double, double* [[TMP11]], align 128
+// CHECK2-NEXT: store double [[TMP12]], double* [[TMP10]], align 8
+// CHECK2-NEXT: ret void
+//
+//
+// CHECK2-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_reduce_func
+// CHECK2-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] {
+// CHECK2-NEXT: entry:
+// CHECK2-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 8
+// CHECK2-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 8
+// CHECK2-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x i8*], align 8
+// CHECK2-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 8
+// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
+// CHECK2-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 8
+// CHECK2-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 8
+// CHECK2-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.1*
+// CHECK2-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4
+// CHECK2-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 0
+// CHECK2-NEXT: [[E:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1:%.*]], %struct._globalized_locals_ty.1* [[TMP4]], i32 0, i32 0
+// CHECK2-NEXT: [[TMP7:%.*]] = getelementptr inbounds [1024 x double], [1024 x double]* [[E]], i32 0, i32 [[TMP5]]
+// CHECK2-NEXT: [[TMP8:%.*]] = bitcast double* [[TMP7]] to i8*
+// CHECK2-NEXT: store i8* [[TMP8]], i8** [[TMP6]], align 8
+// CHECK2-NEXT: [[TMP9:%.*]] = bitcast [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
+// CHECK2-NEXT: [[TMP10:%.*]] = load i8*, i8** [[DOTADDR2]], align 8
+// CHECK2-NEXT: call void @"_omp$reduction$reduction_func"(i8* [[TMP10]], i8* [[TMP9]]) #[[ATTR3]]
+// CHECK2-NEXT: ret void
+//
+//
+// CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l29_worker
+// CHECK2-SAME: () #[[ATTR0]] {
+// CHECK2-NEXT: entry:
+// CHECK2-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 8
+// CHECK2-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1
+// CHECK2-NEXT: store i8* null, i8** [[WORK_FN]], align 8
+// CHECK2-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1
+// CHECK2-NEXT: br label [[DOTAWAIT_WORK:%.*]]
+// CHECK2: .await.work:
+// CHECK2-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
+// CHECK2-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]])
+// CHECK2-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8
+// CHECK2-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1
+// CHECK2-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 8
+// CHECK2-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null
+// CHECK2-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]]
+// CHECK2: .select.workers:
+// CHECK2-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1
+// CHECK2-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0
+// CHECK2-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]]
+// CHECK2: .execute.parallel:
+// CHECK2-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK2-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)*
+// CHECK2-NEXT: call void [[TMP5]](i16 0, i32 [[TMP4]])
+// CHECK2-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]]
+// CHECK2: .terminate.parallel:
+// CHECK2-NEXT: call void @__kmpc_kernel_end_parallel()
+// CHECK2-NEXT: br label [[DOTBARRIER_PARALLEL]]
+// CHECK2: .barrier.parallel:
+// CHECK2-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
+// CHECK2-NEXT: br label [[DOTAWAIT_WORK]]
+// CHECK2: .exit:
+// CHECK2-NEXT: ret void
+//
+//
+// CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l29
+// CHECK2-SAME: (i64 [[C:%.*]], i64 [[D:%.*]]) #[[ATTR1]] {
+// CHECK2-NEXT: entry:
+// CHECK2-NEXT: [[C_ADDR:%.*]] = alloca i64, align 8
+// CHECK2-NEXT: [[D_ADDR:%.*]] = alloca i64, align 8
+// CHECK2-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK2-NEXT: store i64 [[C]], i64* [[C_ADDR]], align 8
+// CHECK2-NEXT: store i64 [[D]], i64* [[D_ADDR]], align 8
+// CHECK2-NEXT: [[CONV:%.*]] = bitcast i64* [[C_ADDR]] to i8*
+// CHECK2-NEXT: [[CONV1:%.*]] = bitcast i64* [[D_ADDR]] to float*
+// CHECK2-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+// CHECK2-NEXT:
[[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK2-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK2-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]] +// CHECK2-NEXT: [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]] +// CHECK2-NEXT: br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]] +// CHECK2: .worker: +// CHECK2-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l29_worker() #[[ATTR3]] +// CHECK2-NEXT: br label [[DOTEXIT:%.*]] +// CHECK2: .mastercheck: +// CHECK2-NEXT: [[NVPTX_TID2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK2-NEXT: [[NVPTX_NUM_THREADS3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK2-NEXT: [[NVPTX_WARP_SIZE4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK2-NEXT: [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE4]], 1 +// CHECK2-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS3]], 1 +// CHECK2-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], -1 +// CHECK2-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP2]], [[TMP3]] +// CHECK2-NEXT: [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID2]], [[MASTER_TID]] +// CHECK2-NEXT: br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]] +// CHECK2: .master: +// CHECK2-NEXT: [[NVPTX_NUM_THREADS5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK2-NEXT: [[NVPTX_WARP_SIZE6:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK2-NEXT: [[THREAD_LIMIT7:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS5]], [[NVPTX_WARP_SIZE6]] +// CHECK2-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT7]], i16 1) +// CHECK2-NEXT: call void @__kmpc_data_sharing_init_stack() +// CHECK2-NEXT: [[TMP5:%.*]] = call i8* @__kmpc_data_sharing_push_stack(i64 8, i16 1) +// CHECK2-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.2* +// CHECK2-NEXT: [[TMP7:%.*]] = load i8, i8* [[CONV]], align 8 +// CHECK2-NEXT: [[C8:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_2:%.*]], %struct._globalized_locals_ty.2* [[TMP6]], i32 0, i32 1 +// CHECK2-NEXT: store i8 [[TMP7]], i8* [[C8]], align 4 +// CHECK2-NEXT: [[TMP8:%.*]] = load float, float* [[CONV1]], align 8 +// CHECK2-NEXT: [[D9:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_2]], %struct._globalized_locals_ty.2* [[TMP6]], i32 0, i32 0 +// CHECK2-NEXT: store float [[TMP8]], float* [[D9]], align 4 +// CHECK2-NEXT: [[TMP9:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) +// CHECK2-NEXT: store i32 [[TMP9]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK2-NEXT: call void @__omp_outlined__1(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i8* [[C8]], float* [[D9]]) #[[ATTR3]] +// CHECK2-NEXT: call void @__kmpc_data_sharing_pop_stack(i8* [[TMP5]]) +// CHECK2-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]] +// CHECK2: .termination.notifier: +// CHECK2-NEXT: call void @__kmpc_kernel_deinit(i16 1) +// CHECK2-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK2-NEXT: br label [[DOTEXIT]] +// CHECK2: .exit: +// CHECK2-NEXT: ret void +// +// +// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__1 +// CHECK2-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i8* nonnull align 1 dereferenceable(1) [[C:%.*]], float* nonnull align 4 dereferenceable(4) [[D:%.*]]) #[[ATTR1]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, 
align 8 +// CHECK2-NEXT: [[C_ADDR:%.*]] = alloca i8*, align 8 +// CHECK2-NEXT: [[D_ADDR:%.*]] = alloca float*, align 8 +// CHECK2-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 8 +// CHECK2-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK2-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +// CHECK2-NEXT: store i8* [[C]], i8** [[C_ADDR]], align 8 +// CHECK2-NEXT: store float* [[D]], float** [[D_ADDR]], align 8 +// CHECK2-NEXT: [[TMP0:%.*]] = load i8*, i8** [[C_ADDR]], align 8 +// CHECK2-NEXT: [[TMP1:%.*]] = load float*, float** [[D_ADDR]], align 8 +// CHECK2-NEXT: [[TMP2:%.*]] = call i8* @__kmpc_data_sharing_push_stack(i64 8, i16 1) +// CHECK2-NEXT: [[TMP3:%.*]] = bitcast i8* [[TMP2]] to %struct._globalized_locals_ty.3* +// CHECK2-NEXT: [[C1:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_3:%.*]], %struct._globalized_locals_ty.3* [[TMP3]], i32 0, i32 1 +// CHECK2-NEXT: [[D2:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_3]], %struct._globalized_locals_ty.3* [[TMP3]], i32 0, i32 0 +// CHECK2-NEXT: store i8 0, i8* [[C1]], align 4 +// CHECK2-NEXT: store float 1.000000e+00, float* [[D2]], align 4 +// CHECK2-NEXT: [[TMP4:%.*]] = load i8, i8* [[C1]], align 4 +// CHECK2-NEXT: [[CONV:%.*]] = sext i8 [[TMP4]] to i32 +// CHECK2-NEXT: [[XOR:%.*]] = xor i32 [[CONV]], 2 +// CHECK2-NEXT: [[CONV3:%.*]] = trunc i32 [[XOR]] to i8 +// CHECK2-NEXT: store i8 [[CONV3]], i8* [[C1]], align 4 +// CHECK2-NEXT: [[TMP5:%.*]] = load float, float* [[D2]], align 4 +// CHECK2-NEXT: [[MUL:%.*]] = fmul float [[TMP5]], 3.300000e+01 +// CHECK2-NEXT: store float [[MUL]], float* [[D2]], align 4 +// CHECK2-NEXT: [[TMP6:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK2-NEXT: [[TMP7:%.*]] = load i32, i32* [[TMP6]], align 4 +// CHECK2-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 0 +// CHECK2-NEXT: store i8* [[C1]], i8** [[TMP8]], align 8 +// CHECK2-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 1 +// CHECK2-NEXT: [[TMP10:%.*]] = bitcast float* [[D2]] to i8* +// CHECK2-NEXT: store i8* [[TMP10]], i8** [[TMP9]], align 8 +// CHECK2-NEXT: [[TMP11:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8* +// CHECK2-NEXT: [[TMP12:%.*]] = load i8*, i8** @"_openmp_teams_reductions_buffer_$_$ptr", align 8 +// CHECK2-NEXT: [[TMP13:%.*]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(%struct.ident_t* @[[GLOB1]], i32 [[TMP7]], i8* [[TMP12]], i32 1024, i8* [[TMP11]], void (i8*, i16, i16, i16)* @_omp_reduction_shuffle_and_reduce_func3, void (i8*, i32)* @_omp_reduction_inter_warp_copy_func4, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_copy_func5, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_reduce_func6, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_copy_func7, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_reduce_func8) +// CHECK2-NEXT: [[TMP14:%.*]] = icmp eq i32 [[TMP13]], 1 +// CHECK2-NEXT: br i1 [[TMP14]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]] +// CHECK2: .omp.reduction.then: +// CHECK2-NEXT: [[TMP15:%.*]] = load i8, i8* [[TMP0]], align 1 +// CHECK2-NEXT: [[CONV4:%.*]] = sext i8 [[TMP15]] to i32 +// CHECK2-NEXT: [[TMP16:%.*]] = load i8, i8* [[C1]], align 4 +// CHECK2-NEXT: [[CONV5:%.*]] = sext i8 [[TMP16]] to i32 +// CHECK2-NEXT: [[XOR6:%.*]] = xor i32 [[CONV4]], [[CONV5]] +// CHECK2-NEXT: [[CONV7:%.*]] = trunc i32 
[[XOR6]] to i8 +// CHECK2-NEXT: store i8 [[CONV7]], i8* [[TMP0]], align 1 +// CHECK2-NEXT: [[TMP17:%.*]] = load float, float* [[TMP1]], align 4 +// CHECK2-NEXT: [[TMP18:%.*]] = load float, float* [[D2]], align 4 +// CHECK2-NEXT: [[MUL8:%.*]] = fmul float [[TMP17]], [[TMP18]] +// CHECK2-NEXT: store float [[MUL8]], float* [[TMP1]], align 4 +// CHECK2-NEXT: call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP7]]) +// CHECK2-NEXT: br label [[DOTOMP_REDUCTION_DONE]] +// CHECK2: .omp.reduction.done: +// CHECK2-NEXT: call void @__kmpc_data_sharing_pop_stack(i8* [[TMP2]]) +// CHECK2-NEXT: ret void +// +// +// CHECK2-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func3 +// CHECK2-SAME: (i8* [[TMP0:%.*]], i16 signext [[TMP1:%.*]], i16 signext [[TMP2:%.*]], i16 signext [[TMP3:%.*]]) #[[ATTR0]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 8 +// CHECK2-NEXT: [[DOTADDR1:%.*]] = alloca i16, align 2 +// CHECK2-NEXT: [[DOTADDR2:%.*]] = alloca i16, align 2 +// CHECK2-NEXT: [[DOTADDR3:%.*]] = alloca i16, align 2 +// CHECK2-NEXT: [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST:%.*]] = alloca [2 x i8*], align 8 +// CHECK2-NEXT: [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca i8, align 1 +// CHECK2-NEXT: [[DOTOMP_REDUCTION_ELEMENT4:%.*]] = alloca float, align 4 +// CHECK2-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 8 +// CHECK2-NEXT: store i16 [[TMP1]], i16* [[DOTADDR1]], align 2 +// CHECK2-NEXT: store i16 [[TMP2]], i16* [[DOTADDR2]], align 2 +// CHECK2-NEXT: store i16 [[TMP3]], i16* [[DOTADDR3]], align 2 +// CHECK2-NEXT: [[TMP4:%.*]] = load i8*, i8** [[DOTADDR]], align 8 +// CHECK2-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to [2 x i8*]* +// CHECK2-NEXT: [[TMP6:%.*]] = load i16, i16* [[DOTADDR1]], align 2 +// CHECK2-NEXT: [[TMP7:%.*]] = load i16, i16* [[DOTADDR2]], align 2 +// CHECK2-NEXT: [[TMP8:%.*]] = load i16, i16* [[DOTADDR3]], align 2 +// CHECK2-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i64 0, i64 0 +// CHECK2-NEXT: [[TMP10:%.*]] = load i8*, i8** [[TMP9]], align 8 +// CHECK2-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i64 0, i64 0 +// CHECK2-NEXT: [[TMP12:%.*]] = getelementptr i8, i8* [[TMP10]], i64 1 +// CHECK2-NEXT: [[TMP13:%.*]] = load i8, i8* [[TMP10]], align 1 +// CHECK2-NEXT: [[TMP14:%.*]] = sext i8 [[TMP13]] to i32 +// CHECK2-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK2-NEXT: [[TMP15:%.*]] = trunc i32 [[NVPTX_WARP_SIZE]] to i16 +// CHECK2-NEXT: [[TMP16:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP14]], i16 [[TMP7]], i16 [[TMP15]]) +// CHECK2-NEXT: [[TMP17:%.*]] = trunc i32 [[TMP16]] to i8 +// CHECK2-NEXT: store i8 [[TMP17]], i8* [[DOTOMP_REDUCTION_ELEMENT]], align 1 +// CHECK2-NEXT: [[TMP18:%.*]] = getelementptr i8, i8* [[TMP10]], i64 1 +// CHECK2-NEXT: [[TMP19:%.*]] = getelementptr i8, i8* [[DOTOMP_REDUCTION_ELEMENT]], i64 1 +// CHECK2-NEXT: store i8* [[DOTOMP_REDUCTION_ELEMENT]], i8** [[TMP11]], align 8 +// CHECK2-NEXT: [[TMP20:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i64 0, i64 1 +// CHECK2-NEXT: [[TMP21:%.*]] = load i8*, i8** [[TMP20]], align 8 +// CHECK2-NEXT: [[TMP22:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i64 0, i64 1 +// CHECK2-NEXT: [[TMP23:%.*]] = bitcast i8* [[TMP21]] to float* +// CHECK2-NEXT: [[TMP24:%.*]] = getelementptr float, float* [[TMP23]], i64 1 +// CHECK2-NEXT: [[TMP25:%.*]] = bitcast float* [[TMP24]] to i8* +// CHECK2-NEXT: 
[[TMP26:%.*]] = bitcast float* [[TMP23]] to i32* +// CHECK2-NEXT: [[TMP27:%.*]] = bitcast float* [[DOTOMP_REDUCTION_ELEMENT4]] to i32* +// CHECK2-NEXT: [[TMP28:%.*]] = load i32, i32* [[TMP26]], align 4 +// CHECK2-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK2-NEXT: [[TMP29:%.*]] = trunc i32 [[NVPTX_WARP_SIZE5]] to i16 +// CHECK2-NEXT: [[TMP30:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP28]], i16 [[TMP7]], i16 [[TMP29]]) +// CHECK2-NEXT: store i32 [[TMP30]], i32* [[TMP27]], align 4 +// CHECK2-NEXT: [[TMP31:%.*]] = getelementptr i32, i32* [[TMP26]], i64 1 +// CHECK2-NEXT: [[TMP32:%.*]] = getelementptr i32, i32* [[TMP27]], i64 1 +// CHECK2-NEXT: [[TMP33:%.*]] = bitcast float* [[DOTOMP_REDUCTION_ELEMENT4]] to i8* +// CHECK2-NEXT: store i8* [[TMP33]], i8** [[TMP22]], align 8 +// CHECK2-NEXT: [[TMP34:%.*]] = icmp eq i16 [[TMP8]], 0 +// CHECK2-NEXT: [[TMP35:%.*]] = icmp eq i16 [[TMP8]], 1 +// CHECK2-NEXT: [[TMP36:%.*]] = icmp ult i16 [[TMP6]], [[TMP7]] +// CHECK2-NEXT: [[TMP37:%.*]] = and i1 [[TMP35]], [[TMP36]] +// CHECK2-NEXT: [[TMP38:%.*]] = icmp eq i16 [[TMP8]], 2 +// CHECK2-NEXT: [[TMP39:%.*]] = and i16 [[TMP6]], 1 +// CHECK2-NEXT: [[TMP40:%.*]] = icmp eq i16 [[TMP39]], 0 +// CHECK2-NEXT: [[TMP41:%.*]] = and i1 [[TMP38]], [[TMP40]] +// CHECK2-NEXT: [[TMP42:%.*]] = icmp sgt i16 [[TMP7]], 0 +// CHECK2-NEXT: [[TMP43:%.*]] = and i1 [[TMP41]], [[TMP42]] +// CHECK2-NEXT: [[TMP44:%.*]] = or i1 [[TMP34]], [[TMP37]] +// CHECK2-NEXT: [[TMP45:%.*]] = or i1 [[TMP44]], [[TMP43]] +// CHECK2-NEXT: br i1 [[TMP45]], label [[THEN:%.*]], label [[ELSE:%.*]] +// CHECK2: then: +// CHECK2-NEXT: [[TMP46:%.*]] = bitcast [2 x i8*]* [[TMP5]] to i8* +// CHECK2-NEXT: [[TMP47:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to i8* +// CHECK2-NEXT: call void @"_omp$reduction$reduction_func2"(i8* [[TMP46]], i8* [[TMP47]]) #[[ATTR3]] +// CHECK2-NEXT: br label [[IFCONT:%.*]] +// CHECK2: else: +// CHECK2-NEXT: br label [[IFCONT]] +// CHECK2: ifcont: +// CHECK2-NEXT: [[TMP48:%.*]] = icmp eq i16 [[TMP8]], 1 +// CHECK2-NEXT: [[TMP49:%.*]] = icmp uge i16 [[TMP6]], [[TMP7]] +// CHECK2-NEXT: [[TMP50:%.*]] = and i1 [[TMP48]], [[TMP49]] +// CHECK2-NEXT: br i1 [[TMP50]], label [[THEN6:%.*]], label [[ELSE7:%.*]] +// CHECK2: then6: +// CHECK2-NEXT: [[TMP51:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i64 0, i64 0 +// CHECK2-NEXT: [[TMP52:%.*]] = load i8*, i8** [[TMP51]], align 8 +// CHECK2-NEXT: [[TMP53:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i64 0, i64 0 +// CHECK2-NEXT: [[TMP54:%.*]] = load i8*, i8** [[TMP53]], align 8 +// CHECK2-NEXT: [[TMP55:%.*]] = load i8, i8* [[TMP52]], align 1 +// CHECK2-NEXT: store i8 [[TMP55]], i8* [[TMP54]], align 1 +// CHECK2-NEXT: [[TMP56:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i64 0, i64 1 +// CHECK2-NEXT: [[TMP57:%.*]] = load i8*, i8** [[TMP56]], align 8 +// CHECK2-NEXT: [[TMP58:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i64 0, i64 1 +// CHECK2-NEXT: [[TMP59:%.*]] = load i8*, i8** [[TMP58]], align 8 +// CHECK2-NEXT: [[TMP60:%.*]] = bitcast i8* [[TMP57]] to float* +// CHECK2-NEXT: [[TMP61:%.*]] = bitcast i8* [[TMP59]] to float* +// CHECK2-NEXT: [[TMP62:%.*]] = load float, float* [[TMP60]], align 4 +// CHECK2-NEXT: store float [[TMP62]], float* [[TMP61]], align 4 +// CHECK2-NEXT: br label [[IFCONT8:%.*]] +// CHECK2: else7: +// CHECK2-NEXT: br label [[IFCONT8]] +// CHECK2: ifcont8: +// CHECK2-NEXT: ret 
void +// +// +// CHECK2-LABEL: define {{[^@]+}}@_omp_reduction_inter_warp_copy_func4 +// CHECK2-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 8 +// CHECK2-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) +// CHECK2-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 8 +// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK2-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK2-NEXT: [[NVPTX_TID2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK2-NEXT: [[NVPTX_LANE_ID:%.*]] = and i32 [[NVPTX_TID2]], 31 +// CHECK2-NEXT: [[NVPTX_TID3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK2-NEXT: [[NVPTX_WARP_ID:%.*]] = ashr i32 [[NVPTX_TID3]], 5 +// CHECK2-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 8 +// CHECK2-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]* +// CHECK2-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]]) +// CHECK2-NEXT: [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0 +// CHECK2-NEXT: br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]] +// CHECK2: then: +// CHECK2-NEXT: [[TMP5:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i64 0, i64 0 +// CHECK2-NEXT: [[TMP6:%.*]] = load i8*, i8** [[TMP5]], align 8 +// CHECK2-NEXT: [[TMP7:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]] +// CHECK2-NEXT: [[TMP8:%.*]] = bitcast i32 addrspace(3)* [[TMP7]] to i8 addrspace(3)* +// CHECK2-NEXT: [[TMP9:%.*]] = load i8, i8* [[TMP6]], align 1 +// CHECK2-NEXT: store volatile i8 [[TMP9]], i8 addrspace(3)* [[TMP8]], align 1 +// CHECK2-NEXT: br label [[IFCONT:%.*]] +// CHECK2: else: +// CHECK2-NEXT: br label [[IFCONT]] +// CHECK2: ifcont: +// CHECK2-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]]) +// CHECK2-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTADDR1]], align 4 +// CHECK2-NEXT: [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP10]] +// CHECK2-NEXT: br i1 [[IS_ACTIVE_THREAD]], label [[THEN4:%.*]], label [[ELSE5:%.*]] +// CHECK2: then4: +// CHECK2-NEXT: [[TMP11:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]] +// CHECK2-NEXT: [[TMP12:%.*]] = bitcast i32 addrspace(3)* [[TMP11]] to i8 addrspace(3)* +// CHECK2-NEXT: [[TMP13:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i64 0, i64 0 +// CHECK2-NEXT: [[TMP14:%.*]] = load i8*, i8** [[TMP13]], align 8 +// CHECK2-NEXT: [[TMP15:%.*]] = load volatile i8, i8 addrspace(3)* [[TMP12]], align 1 +// CHECK2-NEXT: store i8 [[TMP15]], i8* [[TMP14]], align 1 +// CHECK2-NEXT: br label [[IFCONT6:%.*]] +// CHECK2: else5: +// CHECK2-NEXT: br label [[IFCONT6]] +// CHECK2: ifcont6: +// CHECK2-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]]) +// CHECK2-NEXT: [[WARP_MASTER7:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0 +// CHECK2-NEXT: br i1 [[WARP_MASTER7]], label [[THEN8:%.*]], label [[ELSE9:%.*]] +// CHECK2: then8: +// CHECK2-NEXT: [[TMP16:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i64 0, i64 1 +// CHECK2-NEXT: [[TMP17:%.*]] = load i8*, i8** [[TMP16]], align 8 +// CHECK2-NEXT: [[TMP18:%.*]] = bitcast i8* [[TMP17]] to i32* +// CHECK2-NEXT: [[TMP19:%.*]] = getelementptr inbounds [32 x i32], [32 x 
i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]] +// CHECK2-NEXT: [[TMP20:%.*]] = load i32, i32* [[TMP18]], align 4 +// CHECK2-NEXT: store volatile i32 [[TMP20]], i32 addrspace(3)* [[TMP19]], align 4 +// CHECK2-NEXT: br label [[IFCONT10:%.*]] +// CHECK2: else9: +// CHECK2-NEXT: br label [[IFCONT10]] +// CHECK2: ifcont10: +// CHECK2-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]]) +// CHECK2-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTADDR1]], align 4 +// CHECK2-NEXT: [[IS_ACTIVE_THREAD11:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP21]] +// CHECK2-NEXT: br i1 [[IS_ACTIVE_THREAD11]], label [[THEN12:%.*]], label [[ELSE13:%.*]] +// CHECK2: then12: +// CHECK2-NEXT: [[TMP22:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]] +// CHECK2-NEXT: [[TMP23:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i64 0, i64 1 +// CHECK2-NEXT: [[TMP24:%.*]] = load i8*, i8** [[TMP23]], align 8 +// CHECK2-NEXT: [[TMP25:%.*]] = bitcast i8* [[TMP24]] to i32* +// CHECK2-NEXT: [[TMP26:%.*]] = load volatile i32, i32 addrspace(3)* [[TMP22]], align 4 +// CHECK2-NEXT: store i32 [[TMP26]], i32* [[TMP25]], align 4 +// CHECK2-NEXT: br label [[IFCONT14:%.*]] +// CHECK2: else13: +// CHECK2-NEXT: br label [[IFCONT14]] +// CHECK2: ifcont14: +// CHECK2-NEXT: ret void +// +// +// CHECK2-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_copy_func5 +// CHECK2-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 8 +// CHECK2-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 8 +// CHECK2-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 8 +// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK2-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 8 +// CHECK2-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 8 +// CHECK2-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]* +// CHECK2-NEXT: [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 8 +// CHECK2-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.4* +// CHECK2-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4 +// CHECK2-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i64 0, i64 0 +// CHECK2-NEXT: [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 8 +// CHECK2-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4:%.*]], %struct._globalized_locals_ty.4* [[TMP6]], i32 0, i32 0 +// CHECK2-NEXT: [[TMP10:%.*]] = getelementptr inbounds [1024 x i8], [1024 x i8]* [[C]], i32 0, i32 [[TMP7]] +// CHECK2-NEXT: [[TMP11:%.*]] = load i8, i8* [[TMP9]], align 1 +// CHECK2-NEXT: store i8 [[TMP11]], i8* [[TMP10]], align 128 +// CHECK2-NEXT: [[TMP12:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i64 0, i64 1 +// CHECK2-NEXT: [[TMP13:%.*]] = load i8*, i8** [[TMP12]], align 8 +// CHECK2-NEXT: [[TMP14:%.*]] = bitcast i8* [[TMP13]] to float* +// CHECK2-NEXT: [[D:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4]], %struct._globalized_locals_ty.4* [[TMP6]], i32 0, i32 1 +// CHECK2-NEXT: [[TMP15:%.*]] = getelementptr inbounds [1024 x float], [1024 x float]* [[D]], i32 0, i32 [[TMP7]] +// CHECK2-NEXT: [[TMP16:%.*]] = load float, float* [[TMP14]], align 4 +// CHECK2-NEXT: store float [[TMP16]], float* [[TMP15]], align 128 +// CHECK2-NEXT: ret void +// +// +// 
CHECK2-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_reduce_func6 +// CHECK2-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 8 +// CHECK2-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 8 +// CHECK2-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 8 +// CHECK2-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 8 +// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK2-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 8 +// CHECK2-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 8 +// CHECK2-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.4* +// CHECK2-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4 +// CHECK2-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 0 +// CHECK2-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4:%.*]], %struct._globalized_locals_ty.4* [[TMP4]], i32 0, i32 0 +// CHECK2-NEXT: [[TMP7:%.*]] = getelementptr inbounds [1024 x i8], [1024 x i8]* [[C]], i32 0, i32 [[TMP5]] +// CHECK2-NEXT: store i8* [[TMP7]], i8** [[TMP6]], align 8 +// CHECK2-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 1 +// CHECK2-NEXT: [[D:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4]], %struct._globalized_locals_ty.4* [[TMP4]], i32 0, i32 1 +// CHECK2-NEXT: [[TMP9:%.*]] = getelementptr inbounds [1024 x float], [1024 x float]* [[D]], i32 0, i32 [[TMP5]] +// CHECK2-NEXT: [[TMP10:%.*]] = bitcast float* [[TMP9]] to i8* +// CHECK2-NEXT: store i8* [[TMP10]], i8** [[TMP8]], align 8 +// CHECK2-NEXT: [[TMP11:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8* +// CHECK2-NEXT: [[TMP12:%.*]] = load i8*, i8** [[DOTADDR2]], align 8 +// CHECK2-NEXT: call void @"_omp$reduction$reduction_func2"(i8* [[TMP11]], i8* [[TMP12]]) #[[ATTR3]] +// CHECK2-NEXT: ret void +// +// +// CHECK2-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_copy_func7 +// CHECK2-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 8 +// CHECK2-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 8 +// CHECK2-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 8 +// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK2-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 8 +// CHECK2-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 8 +// CHECK2-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]* +// CHECK2-NEXT: [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 8 +// CHECK2-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.4* +// CHECK2-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4 +// CHECK2-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i64 0, i64 0 +// CHECK2-NEXT: [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 8 +// CHECK2-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4:%.*]], %struct._globalized_locals_ty.4* [[TMP6]], i32 0, i32 0 +// CHECK2-NEXT: [[TMP10:%.*]] = getelementptr inbounds [1024 x i8], [1024 x i8]* [[C]], i32 0, i32 [[TMP7]] +// CHECK2-NEXT: [[TMP11:%.*]] = load i8, i8* [[TMP10]], align 128 +// CHECK2-NEXT: store i8 [[TMP11]], i8* [[TMP9]], align 1 +// CHECK2-NEXT: 
[[TMP12:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i64 0, i64 1 +// CHECK2-NEXT: [[TMP13:%.*]] = load i8*, i8** [[TMP12]], align 8 +// CHECK2-NEXT: [[TMP14:%.*]] = bitcast i8* [[TMP13]] to float* +// CHECK2-NEXT: [[D:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4]], %struct._globalized_locals_ty.4* [[TMP6]], i32 0, i32 1 +// CHECK2-NEXT: [[TMP15:%.*]] = getelementptr inbounds [1024 x float], [1024 x float]* [[D]], i32 0, i32 [[TMP7]] +// CHECK2-NEXT: [[TMP16:%.*]] = load float, float* [[TMP15]], align 128 +// CHECK2-NEXT: store float [[TMP16]], float* [[TMP14]], align 4 +// CHECK2-NEXT: ret void +// +// +// CHECK2-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_reduce_func8 +// CHECK2-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 8 +// CHECK2-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 8 +// CHECK2-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 8 +// CHECK2-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 8 +// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK2-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 8 +// CHECK2-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 8 +// CHECK2-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.4* +// CHECK2-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4 +// CHECK2-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 0 +// CHECK2-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4:%.*]], %struct._globalized_locals_ty.4* [[TMP4]], i32 0, i32 0 +// CHECK2-NEXT: [[TMP7:%.*]] = getelementptr inbounds [1024 x i8], [1024 x i8]* [[C]], i32 0, i32 [[TMP5]] +// CHECK2-NEXT: store i8* [[TMP7]], i8** [[TMP6]], align 8 +// CHECK2-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 1 +// CHECK2-NEXT: [[D:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4]], %struct._globalized_locals_ty.4* [[TMP4]], i32 0, i32 1 +// CHECK2-NEXT: [[TMP9:%.*]] = getelementptr inbounds [1024 x float], [1024 x float]* [[D]], i32 0, i32 [[TMP5]] +// CHECK2-NEXT: [[TMP10:%.*]] = bitcast float* [[TMP9]] to i8* +// CHECK2-NEXT: store i8* [[TMP10]], i8** [[TMP8]], align 8 +// CHECK2-NEXT: [[TMP11:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8* +// CHECK2-NEXT: [[TMP12:%.*]] = load i8*, i8** [[DOTADDR2]], align 8 +// CHECK2-NEXT: call void @"_omp$reduction$reduction_func2"(i8* [[TMP12]], i8* [[TMP11]]) #[[ATTR3]] +// CHECK2-NEXT: ret void +// +// +// CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l36 +// CHECK2-SAME: (i64 [[A:%.*]], i64 [[B:%.*]]) #[[ATTR1]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[A_ADDR:%.*]] = alloca i64, align 8 +// CHECK2-NEXT: [[B_ADDR:%.*]] = alloca i64, align 8 +// CHECK2-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK2-NEXT: store i64 [[A]], i64* [[A_ADDR]], align 8 +// CHECK2-NEXT: store i64 [[B]], i64* [[B_ADDR]], align 8 +// CHECK2-NEXT: [[CONV:%.*]] = bitcast i64* [[A_ADDR]] to i32* +// CHECK2-NEXT: [[CONV1:%.*]] = bitcast i64* [[B_ADDR]] to i16* +// CHECK2-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// 
CHECK2-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 1) +// CHECK2-NEXT: call void @__kmpc_data_sharing_init_stack_spmd() +// CHECK2-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK2: .execute: +// CHECK2-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3:[0-9]+]]) +// CHECK2-NEXT: store i32 [[TMP0]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK2-NEXT: call void @__omp_outlined__9(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32* [[CONV]], i16* [[CONV1]]) #[[ATTR3]] +// CHECK2-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK2: .omp.deinit: +// CHECK2-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 1) +// CHECK2-NEXT: br label [[DOTEXIT:%.*]] +// CHECK2: .exit: +// CHECK2-NEXT: ret void +// +// +// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__9 +// CHECK2-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i16* nonnull align 2 dereferenceable(2) [[B:%.*]]) #[[ATTR1]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK2-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 8 +// CHECK2-NEXT: [[B_ADDR:%.*]] = alloca i16*, align 8 +// CHECK2-NEXT: [[A1:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[B2:%.*]] = alloca i16, align 2 +// CHECK2-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [2 x i8*], align 8 +// CHECK2-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 8 +// CHECK2-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK2-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +// CHECK2-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 8 +// CHECK2-NEXT: store i16* [[B]], i16** [[B_ADDR]], align 8 +// CHECK2-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 8 +// CHECK2-NEXT: [[TMP1:%.*]] = load i16*, i16** [[B_ADDR]], align 8 +// CHECK2-NEXT: store i32 0, i32* [[A1]], align 4 +// CHECK2-NEXT: store i16 -32768, i16* [[B2]], align 2 +// CHECK2-NEXT: [[TMP2:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0 +// CHECK2-NEXT: [[TMP3:%.*]] = bitcast i32* [[A1]] to i8* +// CHECK2-NEXT: store i8* [[TMP3]], i8** [[TMP2]], align 8 +// CHECK2-NEXT: [[TMP4:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1 +// CHECK2-NEXT: [[TMP5:%.*]] = bitcast i16* [[B2]] to i8* +// CHECK2-NEXT: store i8* [[TMP5]], i8** [[TMP4]], align 8 +// CHECK2-NEXT: [[TMP6:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK2-NEXT: [[TMP7:%.*]] = load i32, i32* [[TMP6]], align 4 +// CHECK2-NEXT: [[TMP8:%.*]] = bitcast [2 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK2-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP7]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32*, i16*)* @__omp_outlined__10 to i8*), i8* null, i8** [[TMP8]], i64 2) +// CHECK2-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 0 +// CHECK2-NEXT: [[TMP10:%.*]] = bitcast i32* [[A1]] to i8* +// CHECK2-NEXT: store i8* [[TMP10]], i8** [[TMP9]], align 8 +// CHECK2-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 1 +// CHECK2-NEXT: [[TMP12:%.*]] = bitcast i16* [[B2]] to i8* +// CHECK2-NEXT: store i8* [[TMP12]], i8** [[TMP11]], align 8 +// CHECK2-NEXT: [[TMP13:%.*]] = bitcast [2 x i8*]* 
[[DOTOMP_REDUCTION_RED_LIST]] to i8* +// CHECK2-NEXT: [[TMP14:%.*]] = load i8*, i8** @"_openmp_teams_reductions_buffer_$_$ptr", align 8 +// CHECK2-NEXT: [[TMP15:%.*]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(%struct.ident_t* @[[GLOB3]], i32 [[TMP7]], i8* [[TMP14]], i32 1024, i8* [[TMP13]], void (i8*, i16, i16, i16)* @_omp_reduction_shuffle_and_reduce_func15, void (i8*, i32)* @_omp_reduction_inter_warp_copy_func16, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_copy_func17, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_reduce_func18, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_copy_func19, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_reduce_func20) +// CHECK2-NEXT: [[TMP16:%.*]] = icmp eq i32 [[TMP15]], 1 +// CHECK2-NEXT: br i1 [[TMP16]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]] +// CHECK2: .omp.reduction.then: +// CHECK2-NEXT: [[TMP17:%.*]] = load i32, i32* [[TMP0]], align 4 +// CHECK2-NEXT: [[TMP18:%.*]] = load i32, i32* [[A1]], align 4 +// CHECK2-NEXT: [[OR:%.*]] = or i32 [[TMP17]], [[TMP18]] +// CHECK2-NEXT: store i32 [[OR]], i32* [[TMP0]], align 4 +// CHECK2-NEXT: [[TMP19:%.*]] = load i16, i16* [[TMP1]], align 2 +// CHECK2-NEXT: [[CONV:%.*]] = sext i16 [[TMP19]] to i32 +// CHECK2-NEXT: [[TMP20:%.*]] = load i16, i16* [[B2]], align 2 +// CHECK2-NEXT: [[CONV3:%.*]] = sext i16 [[TMP20]] to i32 +// CHECK2-NEXT: [[CMP:%.*]] = icmp sgt i32 [[CONV]], [[CONV3]] +// CHECK2-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK2: cond.true: +// CHECK2-NEXT: [[TMP21:%.*]] = load i16, i16* [[TMP1]], align 2 +// CHECK2-NEXT: br label [[COND_END:%.*]] +// CHECK2: cond.false: +// CHECK2-NEXT: [[TMP22:%.*]] = load i16, i16* [[B2]], align 2 +// CHECK2-NEXT: br label [[COND_END]] +// CHECK2: cond.end: +// CHECK2-NEXT: [[COND:%.*]] = phi i16 [ [[TMP21]], [[COND_TRUE]] ], [ [[TMP22]], [[COND_FALSE]] ] +// CHECK2-NEXT: store i16 [[COND]], i16* [[TMP1]], align 2 +// CHECK2-NEXT: call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP7]]) +// CHECK2-NEXT: br label [[DOTOMP_REDUCTION_DONE]] +// CHECK2: .omp.reduction.done: +// CHECK2-NEXT: ret void +// +// +// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__10 +// CHECK2-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i16* nonnull align 2 dereferenceable(2) [[B:%.*]]) #[[ATTR1]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK2-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 8 +// CHECK2-NEXT: [[B_ADDR:%.*]] = alloca i16*, align 8 +// CHECK2-NEXT: [[A1:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[B2:%.*]] = alloca i16, align 2 +// CHECK2-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 8 +// CHECK2-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK2-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +// CHECK2-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 8 +// CHECK2-NEXT: store i16* [[B]], i16** [[B_ADDR]], align 8 +// CHECK2-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 8 +// CHECK2-NEXT: [[TMP1:%.*]] = load i16*, i16** [[B_ADDR]], align 8 +// CHECK2-NEXT: store i32 0, i32* [[A1]], align 4 +// CHECK2-NEXT: store i16 -32768, i16* [[B2]], align 2 +// CHECK2-NEXT: [[TMP2:%.*]] = load i32, i32* [[A1]], align 4 +// CHECK2-NEXT: [[OR:%.*]] = or i32 [[TMP2]], 1 +// CHECK2-NEXT: store i32 
[[OR]], i32* [[A1]], align 4 +// CHECK2-NEXT: [[TMP3:%.*]] = load i16, i16* [[B2]], align 2 +// CHECK2-NEXT: [[CONV:%.*]] = sext i16 [[TMP3]] to i32 +// CHECK2-NEXT: [[CMP:%.*]] = icmp sgt i32 99, [[CONV]] +// CHECK2-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK2: cond.true: +// CHECK2-NEXT: br label [[COND_END:%.*]] +// CHECK2: cond.false: +// CHECK2-NEXT: [[TMP4:%.*]] = load i16, i16* [[B2]], align 2 +// CHECK2-NEXT: [[CONV3:%.*]] = sext i16 [[TMP4]] to i32 +// CHECK2-NEXT: br label [[COND_END]] +// CHECK2: cond.end: +// CHECK2-NEXT: [[COND:%.*]] = phi i32 [ 99, [[COND_TRUE]] ], [ [[CONV3]], [[COND_FALSE]] ] +// CHECK2-NEXT: [[CONV4:%.*]] = trunc i32 [[COND]] to i16 +// CHECK2-NEXT: store i16 [[CONV4]], i16* [[B2]], align 2 +// CHECK2-NEXT: [[TMP5:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK2-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4 +// CHECK2-NEXT: [[TMP7:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 0 +// CHECK2-NEXT: [[TMP8:%.*]] = bitcast i32* [[A1]] to i8* +// CHECK2-NEXT: store i8* [[TMP8]], i8** [[TMP7]], align 8 +// CHECK2-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 1 +// CHECK2-NEXT: [[TMP10:%.*]] = bitcast i16* [[B2]] to i8* +// CHECK2-NEXT: store i8* [[TMP10]], i8** [[TMP9]], align 8 +// CHECK2-NEXT: [[TMP11:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8* +// CHECK2-NEXT: [[TMP12:%.*]] = call i32 @__kmpc_nvptx_parallel_reduce_nowait_v2(%struct.ident_t* @[[GLOB3]], i32 [[TMP6]], i32 2, i64 16, i8* [[TMP11]], void (i8*, i16, i16, i16)* @_omp_reduction_shuffle_and_reduce_func12, void (i8*, i32)* @_omp_reduction_inter_warp_copy_func13) +// CHECK2-NEXT: [[TMP13:%.*]] = icmp eq i32 [[TMP12]], 1 +// CHECK2-NEXT: br i1 [[TMP13]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]] +// CHECK2: .omp.reduction.then: +// CHECK2-NEXT: [[TMP14:%.*]] = load i32, i32* [[TMP0]], align 4 +// CHECK2-NEXT: [[TMP15:%.*]] = load i32, i32* [[A1]], align 4 +// CHECK2-NEXT: [[OR5:%.*]] = or i32 [[TMP14]], [[TMP15]] +// CHECK2-NEXT: store i32 [[OR5]], i32* [[TMP0]], align 4 +// CHECK2-NEXT: [[TMP16:%.*]] = load i16, i16* [[TMP1]], align 2 +// CHECK2-NEXT: [[CONV6:%.*]] = sext i16 [[TMP16]] to i32 +// CHECK2-NEXT: [[TMP17:%.*]] = load i16, i16* [[B2]], align 2 +// CHECK2-NEXT: [[CONV7:%.*]] = sext i16 [[TMP17]] to i32 +// CHECK2-NEXT: [[CMP8:%.*]] = icmp sgt i32 [[CONV6]], [[CONV7]] +// CHECK2-NEXT: br i1 [[CMP8]], label [[COND_TRUE9:%.*]], label [[COND_FALSE10:%.*]] +// CHECK2: cond.true9: +// CHECK2-NEXT: [[TMP18:%.*]] = load i16, i16* [[TMP1]], align 2 +// CHECK2-NEXT: br label [[COND_END11:%.*]] +// CHECK2: cond.false10: +// CHECK2-NEXT: [[TMP19:%.*]] = load i16, i16* [[B2]], align 2 +// CHECK2-NEXT: br label [[COND_END11]] +// CHECK2: cond.end11: +// CHECK2-NEXT: [[COND12:%.*]] = phi i16 [ [[TMP18]], [[COND_TRUE9]] ], [ [[TMP19]], [[COND_FALSE10]] ] +// CHECK2-NEXT: store i16 [[COND12]], i16* [[TMP1]], align 2 +// CHECK2-NEXT: call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP6]]) +// CHECK2-NEXT: br label [[DOTOMP_REDUCTION_DONE]] +// CHECK2: .omp.reduction.done: +// CHECK2-NEXT: ret void +// +// +// CHECK2-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func12 +// CHECK2-SAME: (i8* [[TMP0:%.*]], i16 signext [[TMP1:%.*]], i16 signext [[TMP2:%.*]], i16 signext [[TMP3:%.*]]) #[[ATTR0]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 8 +// 
CHECK2-NEXT: [[DOTADDR1:%.*]] = alloca i16, align 2 +// CHECK2-NEXT: [[DOTADDR2:%.*]] = alloca i16, align 2 +// CHECK2-NEXT: [[DOTADDR3:%.*]] = alloca i16, align 2 +// CHECK2-NEXT: [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST:%.*]] = alloca [2 x i8*], align 8 +// CHECK2-NEXT: [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_REDUCTION_ELEMENT4:%.*]] = alloca i16, align 2 +// CHECK2-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 8 +// CHECK2-NEXT: store i16 [[TMP1]], i16* [[DOTADDR1]], align 2 +// CHECK2-NEXT: store i16 [[TMP2]], i16* [[DOTADDR2]], align 2 +// CHECK2-NEXT: store i16 [[TMP3]], i16* [[DOTADDR3]], align 2 +// CHECK2-NEXT: [[TMP4:%.*]] = load i8*, i8** [[DOTADDR]], align 8 +// CHECK2-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to [2 x i8*]* +// CHECK2-NEXT: [[TMP6:%.*]] = load i16, i16* [[DOTADDR1]], align 2 +// CHECK2-NEXT: [[TMP7:%.*]] = load i16, i16* [[DOTADDR2]], align 2 +// CHECK2-NEXT: [[TMP8:%.*]] = load i16, i16* [[DOTADDR3]], align 2 +// CHECK2-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i64 0, i64 0 +// CHECK2-NEXT: [[TMP10:%.*]] = load i8*, i8** [[TMP9]], align 8 +// CHECK2-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i64 0, i64 0 +// CHECK2-NEXT: [[TMP12:%.*]] = bitcast i8* [[TMP10]] to i32* +// CHECK2-NEXT: [[TMP13:%.*]] = getelementptr i32, i32* [[TMP12]], i64 1 +// CHECK2-NEXT: [[TMP14:%.*]] = bitcast i32* [[TMP13]] to i8* +// CHECK2-NEXT: [[TMP15:%.*]] = load i32, i32* [[TMP12]], align 4 +// CHECK2-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK2-NEXT: [[TMP16:%.*]] = trunc i32 [[NVPTX_WARP_SIZE]] to i16 +// CHECK2-NEXT: [[TMP17:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP15]], i16 [[TMP7]], i16 [[TMP16]]) +// CHECK2-NEXT: store i32 [[TMP17]], i32* [[DOTOMP_REDUCTION_ELEMENT]], align 4 +// CHECK2-NEXT: [[TMP18:%.*]] = getelementptr i32, i32* [[TMP12]], i64 1 +// CHECK2-NEXT: [[TMP19:%.*]] = getelementptr i32, i32* [[DOTOMP_REDUCTION_ELEMENT]], i64 1 +// CHECK2-NEXT: [[TMP20:%.*]] = bitcast i32* [[DOTOMP_REDUCTION_ELEMENT]] to i8* +// CHECK2-NEXT: store i8* [[TMP20]], i8** [[TMP11]], align 8 +// CHECK2-NEXT: [[TMP21:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i64 0, i64 1 +// CHECK2-NEXT: [[TMP22:%.*]] = load i8*, i8** [[TMP21]], align 8 +// CHECK2-NEXT: [[TMP23:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i64 0, i64 1 +// CHECK2-NEXT: [[TMP24:%.*]] = bitcast i8* [[TMP22]] to i16* +// CHECK2-NEXT: [[TMP25:%.*]] = getelementptr i16, i16* [[TMP24]], i64 1 +// CHECK2-NEXT: [[TMP26:%.*]] = bitcast i16* [[TMP25]] to i8* +// CHECK2-NEXT: [[TMP27:%.*]] = load i16, i16* [[TMP24]], align 2 +// CHECK2-NEXT: [[TMP28:%.*]] = sext i16 [[TMP27]] to i32 +// CHECK2-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK2-NEXT: [[TMP29:%.*]] = trunc i32 [[NVPTX_WARP_SIZE5]] to i16 +// CHECK2-NEXT: [[TMP30:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP28]], i16 [[TMP7]], i16 [[TMP29]]) +// CHECK2-NEXT: [[TMP31:%.*]] = trunc i32 [[TMP30]] to i16 +// CHECK2-NEXT: store i16 [[TMP31]], i16* [[DOTOMP_REDUCTION_ELEMENT4]], align 2 +// CHECK2-NEXT: [[TMP32:%.*]] = getelementptr i16, i16* [[TMP24]], i64 1 +// CHECK2-NEXT: [[TMP33:%.*]] = getelementptr i16, i16* [[DOTOMP_REDUCTION_ELEMENT4]], i64 1 +// CHECK2-NEXT: [[TMP34:%.*]] = bitcast i16* [[DOTOMP_REDUCTION_ELEMENT4]] to i8* +// CHECK2-NEXT: store i8* [[TMP34]], i8** 
[[TMP23]], align 8 +// CHECK2-NEXT: [[TMP35:%.*]] = icmp eq i16 [[TMP8]], 0 +// CHECK2-NEXT: [[TMP36:%.*]] = icmp eq i16 [[TMP8]], 1 +// CHECK2-NEXT: [[TMP37:%.*]] = icmp ult i16 [[TMP6]], [[TMP7]] +// CHECK2-NEXT: [[TMP38:%.*]] = and i1 [[TMP36]], [[TMP37]] +// CHECK2-NEXT: [[TMP39:%.*]] = icmp eq i16 [[TMP8]], 2 +// CHECK2-NEXT: [[TMP40:%.*]] = and i16 [[TMP6]], 1 +// CHECK2-NEXT: [[TMP41:%.*]] = icmp eq i16 [[TMP40]], 0 +// CHECK2-NEXT: [[TMP42:%.*]] = and i1 [[TMP39]], [[TMP41]] +// CHECK2-NEXT: [[TMP43:%.*]] = icmp sgt i16 [[TMP7]], 0 +// CHECK2-NEXT: [[TMP44:%.*]] = and i1 [[TMP42]], [[TMP43]] +// CHECK2-NEXT: [[TMP45:%.*]] = or i1 [[TMP35]], [[TMP38]] +// CHECK2-NEXT: [[TMP46:%.*]] = or i1 [[TMP45]], [[TMP44]] +// CHECK2-NEXT: br i1 [[TMP46]], label [[THEN:%.*]], label [[ELSE:%.*]] +// CHECK2: then: +// CHECK2-NEXT: [[TMP47:%.*]] = bitcast [2 x i8*]* [[TMP5]] to i8* +// CHECK2-NEXT: [[TMP48:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to i8* +// CHECK2-NEXT: call void @"_omp$reduction$reduction_func11"(i8* [[TMP47]], i8* [[TMP48]]) #[[ATTR3]] +// CHECK2-NEXT: br label [[IFCONT:%.*]] +// CHECK2: else: +// CHECK2-NEXT: br label [[IFCONT]] +// CHECK2: ifcont: +// CHECK2-NEXT: [[TMP49:%.*]] = icmp eq i16 [[TMP8]], 1 +// CHECK2-NEXT: [[TMP50:%.*]] = icmp uge i16 [[TMP6]], [[TMP7]] +// CHECK2-NEXT: [[TMP51:%.*]] = and i1 [[TMP49]], [[TMP50]] +// CHECK2-NEXT: br i1 [[TMP51]], label [[THEN6:%.*]], label [[ELSE7:%.*]] +// CHECK2: then6: +// CHECK2-NEXT: [[TMP52:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i64 0, i64 0 +// CHECK2-NEXT: [[TMP53:%.*]] = load i8*, i8** [[TMP52]], align 8 +// CHECK2-NEXT: [[TMP54:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i64 0, i64 0 +// CHECK2-NEXT: [[TMP55:%.*]] = load i8*, i8** [[TMP54]], align 8 +// CHECK2-NEXT: [[TMP56:%.*]] = bitcast i8* [[TMP53]] to i32* +// CHECK2-NEXT: [[TMP57:%.*]] = bitcast i8* [[TMP55]] to i32* +// CHECK2-NEXT: [[TMP58:%.*]] = load i32, i32* [[TMP56]], align 4 +// CHECK2-NEXT: store i32 [[TMP58]], i32* [[TMP57]], align 4 +// CHECK2-NEXT: [[TMP59:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i64 0, i64 1 +// CHECK2-NEXT: [[TMP60:%.*]] = load i8*, i8** [[TMP59]], align 8 +// CHECK2-NEXT: [[TMP61:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i64 0, i64 1 +// CHECK2-NEXT: [[TMP62:%.*]] = load i8*, i8** [[TMP61]], align 8 +// CHECK2-NEXT: [[TMP63:%.*]] = bitcast i8* [[TMP60]] to i16* +// CHECK2-NEXT: [[TMP64:%.*]] = bitcast i8* [[TMP62]] to i16* +// CHECK2-NEXT: [[TMP65:%.*]] = load i16, i16* [[TMP63]], align 2 +// CHECK2-NEXT: store i16 [[TMP65]], i16* [[TMP64]], align 2 +// CHECK2-NEXT: br label [[IFCONT8:%.*]] +// CHECK2: else7: +// CHECK2-NEXT: br label [[IFCONT8]] +// CHECK2: ifcont8: +// CHECK2-NEXT: ret void +// +// +// CHECK2-LABEL: define {{[^@]+}}@_omp_reduction_inter_warp_copy_func13 +// CHECK2-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 8 +// CHECK2-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) +// CHECK2-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 8 +// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK2-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK2-NEXT: [[NVPTX_TID2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK2-NEXT: 
[[NVPTX_LANE_ID:%.*]] = and i32 [[NVPTX_TID2]], 31
+// CHECK2-NEXT:    [[NVPTX_TID3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+// CHECK2-NEXT:    [[NVPTX_WARP_ID:%.*]] = ashr i32 [[NVPTX_TID3]], 5
+// CHECK2-NEXT:    [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 8
+// CHECK2-NEXT:    [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]*
+// CHECK2-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4:[0-9]+]], i32 [[TMP2]])
+// CHECK2-NEXT:    [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0
+// CHECK2-NEXT:    br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]]
+// CHECK2:       then:
+// CHECK2-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i64 0, i64 0
+// CHECK2-NEXT:    [[TMP6:%.*]] = load i8*, i8** [[TMP5]], align 8
+// CHECK2-NEXT:    [[TMP7:%.*]] = bitcast i8* [[TMP6]] to i32*
+// CHECK2-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]]
+// CHECK2-NEXT:    [[TMP9:%.*]] = load i32, i32* [[TMP7]], align 4
+// CHECK2-NEXT:    store volatile i32 [[TMP9]], i32 addrspace(3)* [[TMP8]], align 4
+// CHECK2-NEXT:    br label [[IFCONT:%.*]]
+// CHECK2:       else:
+// CHECK2-NEXT:    br label [[IFCONT]]
+// CHECK2:       ifcont:
+// CHECK2-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]])
+// CHECK2-NEXT:    [[TMP10:%.*]] = load i32, i32* [[DOTADDR1]], align 4
+// CHECK2-NEXT:    [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP10]]
+// CHECK2-NEXT:    br i1 [[IS_ACTIVE_THREAD]], label [[THEN4:%.*]], label [[ELSE5:%.*]]
+// CHECK2:       then4:
+// CHECK2-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]]
+// CHECK2-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i64 0, i64 0
+// CHECK2-NEXT:    [[TMP13:%.*]] = load i8*, i8** [[TMP12]], align 8
+// CHECK2-NEXT:    [[TMP14:%.*]] = bitcast i8* [[TMP13]] to i32*
+// CHECK2-NEXT:    [[TMP15:%.*]] = load volatile i32, i32 addrspace(3)* [[TMP11]], align 4
+// CHECK2-NEXT:    store i32 [[TMP15]], i32* [[TMP14]], align 4
+// CHECK2-NEXT:    br label [[IFCONT6:%.*]]
+// CHECK2:       else5:
+// CHECK2-NEXT:    br label [[IFCONT6]]
+// CHECK2:       ifcont6:
+// CHECK2-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]])
+// CHECK2-NEXT:    [[WARP_MASTER7:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0
+// CHECK2-NEXT:    br i1 [[WARP_MASTER7]], label [[THEN8:%.*]], label [[ELSE9:%.*]]
+// CHECK2:       then8:
+// CHECK2-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i64 0, i64 1
+// CHECK2-NEXT:    [[TMP17:%.*]] = load i8*, i8** [[TMP16]], align 8
+// CHECK2-NEXT:    [[TMP18:%.*]] = bitcast i8* [[TMP17]] to i16*
+// CHECK2-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]]
+// CHECK2-NEXT:    [[TMP20:%.*]] = bitcast i32 addrspace(3)* [[TMP19]] to i16 addrspace(3)*
+// CHECK2-NEXT:    [[TMP21:%.*]] = load i16, i16* [[TMP18]], align 2
+// CHECK2-NEXT:    store volatile i16 [[TMP21]], i16 addrspace(3)* [[TMP20]], align 2
+// CHECK2-NEXT:    br label [[IFCONT10:%.*]]
+// CHECK2:       else9:
+// CHECK2-NEXT:    br label [[IFCONT10]]
+// CHECK2:       ifcont10:
+// CHECK2-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]])
+// CHECK2-NEXT:    [[TMP22:%.*]] = load i32, i32* [[DOTADDR1]], align 4
+// CHECK2-NEXT:    [[IS_ACTIVE_THREAD11:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP22]]
+// CHECK2-NEXT:    br i1 [[IS_ACTIVE_THREAD11]], label [[THEN12:%.*]], label [[ELSE13:%.*]]
+// CHECK2:       then12:
+// CHECK2-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]]
+// CHECK2-NEXT:    [[TMP24:%.*]] = bitcast i32 addrspace(3)* [[TMP23]] to i16 addrspace(3)*
+// CHECK2-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i64 0, i64 1
+// CHECK2-NEXT:    [[TMP26:%.*]] = load i8*, i8** [[TMP25]], align 8
+// CHECK2-NEXT:    [[TMP27:%.*]] = bitcast i8* [[TMP26]] to i16*
+// CHECK2-NEXT:    [[TMP28:%.*]] = load volatile i16, i16 addrspace(3)* [[TMP24]], align 2
+// CHECK2-NEXT:    store i16 [[TMP28]], i16* [[TMP27]], align 2
+// CHECK2-NEXT:    br label [[IFCONT14:%.*]]
+// CHECK2:       else13:
+// CHECK2-NEXT:    br label [[IFCONT14]]
+// CHECK2:       ifcont14:
+// CHECK2-NEXT:    ret void
+//
+//
+// CHECK2-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func15
+// CHECK2-SAME: (i8* [[TMP0:%.*]], i16 signext [[TMP1:%.*]], i16 signext [[TMP2:%.*]], i16 signext [[TMP3:%.*]]) #[[ATTR0]] {
+// CHECK2-NEXT:  entry:
+// CHECK2-NEXT:    [[DOTADDR:%.*]] = alloca i8*, align 8
+// CHECK2-NEXT:    [[DOTADDR1:%.*]] = alloca i16, align 2
+// CHECK2-NEXT:    [[DOTADDR2:%.*]] = alloca i16, align 2
+// CHECK2-NEXT:    [[DOTADDR3:%.*]] = alloca i16, align 2
+// CHECK2-NEXT:    [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST:%.*]] = alloca [2 x i8*], align 8
+// CHECK2-NEXT:    [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca i32, align 4
+// CHECK2-NEXT:    [[DOTOMP_REDUCTION_ELEMENT4:%.*]] = alloca i16, align 2
+// CHECK2-NEXT:    store i8* [[TMP0]], i8** [[DOTADDR]], align 8
+// CHECK2-NEXT:    store i16 [[TMP1]], i16* [[DOTADDR1]], align 2
+// CHECK2-NEXT:    store i16 [[TMP2]], i16* [[DOTADDR2]], align 2
+// CHECK2-NEXT:    store i16 [[TMP3]], i16* [[DOTADDR3]], align 2
+// CHECK2-NEXT:    [[TMP4:%.*]] = load i8*, i8** [[DOTADDR]], align 8
+// CHECK2-NEXT:    [[TMP5:%.*]] = bitcast i8* [[TMP4]] to [2 x i8*]*
+// CHECK2-NEXT:    [[TMP6:%.*]] = load i16, i16* [[DOTADDR1]], align 2
+// CHECK2-NEXT:    [[TMP7:%.*]] = load i16, i16* [[DOTADDR2]], align 2
+// CHECK2-NEXT:    [[TMP8:%.*]] = load i16, i16* [[DOTADDR3]], align 2
+// CHECK2-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i64 0, i64 0
+// CHECK2-NEXT:    [[TMP10:%.*]] = load i8*, i8** [[TMP9]], align 8
+// CHECK2-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i64 0, i64 0
+// CHECK2-NEXT:    [[TMP12:%.*]] = bitcast i8* [[TMP10]] to i32*
+// CHECK2-NEXT:    [[TMP13:%.*]] = getelementptr i32, i32* [[TMP12]], i64 1
+// CHECK2-NEXT:    [[TMP14:%.*]] = bitcast i32* [[TMP13]] to i8*
+// CHECK2-NEXT:    [[TMP15:%.*]] = load i32, i32* [[TMP12]], align 4
+// CHECK2-NEXT:    [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+// CHECK2-NEXT:    [[TMP16:%.*]] = trunc i32 [[NVPTX_WARP_SIZE]] to i16
+// CHECK2-NEXT:    [[TMP17:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP15]], i16 [[TMP7]], i16 [[TMP16]])
+// CHECK2-NEXT:    store i32 [[TMP17]], i32* [[DOTOMP_REDUCTION_ELEMENT]], align 4
+// CHECK2-NEXT:    [[TMP18:%.*]] = getelementptr i32, i32* [[TMP12]], i64 1
+// CHECK2-NEXT:    [[TMP19:%.*]] = getelementptr i32, i32* [[DOTOMP_REDUCTION_ELEMENT]], i64 1
+// CHECK2-NEXT:    [[TMP20:%.*]] = bitcast i32* [[DOTOMP_REDUCTION_ELEMENT]] to i8*
+// CHECK2-NEXT:    store i8* [[TMP20]], i8** [[TMP11]], align 8
+// CHECK2-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i64 0, i64 1
+// CHECK2-NEXT:    [[TMP22:%.*]] = load i8*, i8** [[TMP21]], align 8
+// CHECK2-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i64 0, i64 1
+// CHECK2-NEXT:    [[TMP24:%.*]] = bitcast i8* [[TMP22]] to i16*
+// CHECK2-NEXT:    [[TMP25:%.*]] = getelementptr i16, i16* [[TMP24]], i64 1
+// CHECK2-NEXT:    [[TMP26:%.*]] = bitcast i16* [[TMP25]] to i8*
+// CHECK2-NEXT:    [[TMP27:%.*]] = load i16, i16* [[TMP24]], align 2
+// CHECK2-NEXT:    [[TMP28:%.*]] = sext i16 [[TMP27]] to i32
+// CHECK2-NEXT:    [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+// CHECK2-NEXT:    [[TMP29:%.*]] = trunc i32 [[NVPTX_WARP_SIZE5]] to i16
+// CHECK2-NEXT:    [[TMP30:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP28]], i16 [[TMP7]], i16 [[TMP29]])
+// CHECK2-NEXT:    [[TMP31:%.*]] = trunc i32 [[TMP30]] to i16
+// CHECK2-NEXT:    store i16 [[TMP31]], i16* [[DOTOMP_REDUCTION_ELEMENT4]], align 2
+// CHECK2-NEXT:    [[TMP32:%.*]] = getelementptr i16, i16* [[TMP24]], i64 1
+// CHECK2-NEXT:    [[TMP33:%.*]] = getelementptr i16, i16* [[DOTOMP_REDUCTION_ELEMENT4]], i64 1
+// CHECK2-NEXT:    [[TMP34:%.*]] = bitcast i16* [[DOTOMP_REDUCTION_ELEMENT4]] to i8*
+// CHECK2-NEXT:    store i8* [[TMP34]], i8** [[TMP23]], align 8
+// CHECK2-NEXT:    [[TMP35:%.*]] = icmp eq i16 [[TMP8]], 0
+// CHECK2-NEXT:    [[TMP36:%.*]] = icmp eq i16 [[TMP8]], 1
+// CHECK2-NEXT:    [[TMP37:%.*]] = icmp ult i16 [[TMP6]], [[TMP7]]
+// CHECK2-NEXT:    [[TMP38:%.*]] = and i1 [[TMP36]], [[TMP37]]
+// CHECK2-NEXT:    [[TMP39:%.*]] = icmp eq i16 [[TMP8]], 2
+// CHECK2-NEXT:    [[TMP40:%.*]] = and i16 [[TMP6]], 1
+// CHECK2-NEXT:    [[TMP41:%.*]] = icmp eq i16 [[TMP40]], 0
+// CHECK2-NEXT:    [[TMP42:%.*]] = and i1 [[TMP39]], [[TMP41]]
+// CHECK2-NEXT:    [[TMP43:%.*]] = icmp sgt i16 [[TMP7]], 0
+// CHECK2-NEXT:    [[TMP44:%.*]] = and i1 [[TMP42]], [[TMP43]]
+// CHECK2-NEXT:    [[TMP45:%.*]] = or i1 [[TMP35]], [[TMP38]]
+// CHECK2-NEXT:    [[TMP46:%.*]] = or i1 [[TMP45]], [[TMP44]]
+// CHECK2-NEXT:    br i1 [[TMP46]], label [[THEN:%.*]], label [[ELSE:%.*]]
+// CHECK2:       then:
+// CHECK2-NEXT:    [[TMP47:%.*]] = bitcast [2 x i8*]* [[TMP5]] to i8*
+// CHECK2-NEXT:    [[TMP48:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to i8*
+// CHECK2-NEXT:    call void @"_omp$reduction$reduction_func14"(i8* [[TMP47]], i8* [[TMP48]]) #[[ATTR3]]
+// CHECK2-NEXT:    br label [[IFCONT:%.*]]
+// CHECK2:       else:
+// CHECK2-NEXT:    br label [[IFCONT]]
+// CHECK2:       ifcont:
+// CHECK2-NEXT:    [[TMP49:%.*]] = icmp eq i16 [[TMP8]], 1
+// CHECK2-NEXT:    [[TMP50:%.*]] = icmp uge i16 [[TMP6]], [[TMP7]]
+// CHECK2-NEXT:    [[TMP51:%.*]] = and i1 [[TMP49]], [[TMP50]]
+// CHECK2-NEXT:    br i1 [[TMP51]], label [[THEN6:%.*]], label [[ELSE7:%.*]]
+// CHECK2:       then6:
+// CHECK2-NEXT:    [[TMP52:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i64 0, i64 0
+// CHECK2-NEXT:    [[TMP53:%.*]] = load i8*, i8** [[TMP52]], align 8
+// CHECK2-NEXT:    [[TMP54:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i64 0, i64 0
+// CHECK2-NEXT:    [[TMP55:%.*]] = load i8*, i8** [[TMP54]], align 8
+// CHECK2-NEXT:    [[TMP56:%.*]] = bitcast i8* [[TMP53]] to i32*
+// CHECK2-NEXT:    [[TMP57:%.*]] = bitcast i8* [[TMP55]] to i32*
+// CHECK2-NEXT:    [[TMP58:%.*]] = load i32, i32* [[TMP56]], align 4
+// CHECK2-NEXT:    store i32 [[TMP58]], i32* [[TMP57]], align 4
+// CHECK2-NEXT:    [[TMP59:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i64 0, i64 1
+// CHECK2-NEXT:    [[TMP60:%.*]] = load i8*, i8** [[TMP59]], align 8
+// CHECK2-NEXT:    [[TMP61:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i64 0, i64 1
+// CHECK2-NEXT:    [[TMP62:%.*]] = load i8*, i8** [[TMP61]], align 8
+// CHECK2-NEXT:    [[TMP63:%.*]] = bitcast i8* [[TMP60]] to i16*
+// CHECK2-NEXT:    [[TMP64:%.*]] = bitcast i8* [[TMP62]] to i16*
+// CHECK2-NEXT:    [[TMP65:%.*]] = load i16, i16* [[TMP63]], align 2
+// CHECK2-NEXT:    store i16 [[TMP65]], i16* [[TMP64]], align 2
+// CHECK2-NEXT:    br label [[IFCONT8:%.*]]
+// CHECK2:       else7:
+// CHECK2-NEXT:    br label [[IFCONT8]]
+// CHECK2:       ifcont8:
+// CHECK2-NEXT:    ret void
+//
+//
+// CHECK2-LABEL: define {{[^@]+}}@_omp_reduction_inter_warp_copy_func16
+// CHECK2-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] {
+// CHECK2-NEXT:  entry:
+// CHECK2-NEXT:    [[DOTADDR:%.*]] = alloca i8*, align 8
+// CHECK2-NEXT:    [[DOTADDR1:%.*]] = alloca i32, align 4
+// CHECK2-NEXT:    [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]])
+// CHECK2-NEXT:    store i8* [[TMP0]], i8** [[DOTADDR]], align 8
+// CHECK2-NEXT:    store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
+// CHECK2-NEXT:    [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+// CHECK2-NEXT:    [[NVPTX_TID2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+// CHECK2-NEXT:    [[NVPTX_LANE_ID:%.*]] = and i32 [[NVPTX_TID2]], 31
+// CHECK2-NEXT:    [[NVPTX_TID3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+// CHECK2-NEXT:    [[NVPTX_WARP_ID:%.*]] = ashr i32 [[NVPTX_TID3]], 5
+// CHECK2-NEXT:    [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 8
+// CHECK2-NEXT:    [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]*
+// CHECK2-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]])
+// CHECK2-NEXT:    [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0
+// CHECK2-NEXT:    br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]]
+// CHECK2:       then:
+// CHECK2-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i64 0, i64 0
+// CHECK2-NEXT:    [[TMP6:%.*]] = load i8*, i8** [[TMP5]], align 8
+// CHECK2-NEXT:    [[TMP7:%.*]] = bitcast i8* [[TMP6]] to i32*
+// CHECK2-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]]
+// CHECK2-NEXT:    [[TMP9:%.*]] = load i32, i32* [[TMP7]], align 4
+// CHECK2-NEXT:    store volatile i32 [[TMP9]], i32 addrspace(3)* [[TMP8]], align 4
+// CHECK2-NEXT:    br label [[IFCONT:%.*]]
+// CHECK2:       else:
+// CHECK2-NEXT:    br label [[IFCONT]]
+// CHECK2:       ifcont:
+// CHECK2-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]])
+// CHECK2-NEXT:    [[TMP10:%.*]] = load i32, i32* [[DOTADDR1]], align 4
+// CHECK2-NEXT:    [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP10]]
+// CHECK2-NEXT:    br i1 [[IS_ACTIVE_THREAD]], label [[THEN4:%.*]], label [[ELSE5:%.*]]
+// CHECK2:       then4:
+// CHECK2-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]]
+// CHECK2-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i64 0, i64 0
+// CHECK2-NEXT:    [[TMP13:%.*]] = load i8*, i8** [[TMP12]], align 8
+// CHECK2-NEXT:    [[TMP14:%.*]] = bitcast i8* [[TMP13]] to i32*
+// CHECK2-NEXT:    [[TMP15:%.*]] = load volatile i32, i32 addrspace(3)* [[TMP11]], align 4
+// CHECK2-NEXT:    store i32 [[TMP15]], i32* [[TMP14]], align 4
+// CHECK2-NEXT:    br label [[IFCONT6:%.*]]
+// CHECK2:       else5:
+// CHECK2-NEXT:    br label [[IFCONT6]]
+// CHECK2:       ifcont6:
+// CHECK2-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]])
+// CHECK2-NEXT:    [[WARP_MASTER7:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0
+// CHECK2-NEXT:    br i1 [[WARP_MASTER7]], label [[THEN8:%.*]], label [[ELSE9:%.*]]
+// CHECK2:       then8:
+// CHECK2-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i64 0, i64 1
+// CHECK2-NEXT:    [[TMP17:%.*]] = load i8*, i8** [[TMP16]], align 8
+// CHECK2-NEXT:    [[TMP18:%.*]] = bitcast i8* [[TMP17]] to i16*
+// CHECK2-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]]
+// CHECK2-NEXT:    [[TMP20:%.*]] = bitcast i32 addrspace(3)* [[TMP19]] to i16 addrspace(3)*
+// CHECK2-NEXT:    [[TMP21:%.*]] = load i16, i16* [[TMP18]], align 2
+// CHECK2-NEXT:    store volatile i16 [[TMP21]], i16 addrspace(3)* [[TMP20]], align 2
+// CHECK2-NEXT:    br label [[IFCONT10:%.*]]
+// CHECK2:       else9:
+// CHECK2-NEXT:    br label [[IFCONT10]]
+// CHECK2:       ifcont10:
+// CHECK2-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]])
+// CHECK2-NEXT:    [[TMP22:%.*]] = load i32, i32* [[DOTADDR1]], align 4
+// CHECK2-NEXT:    [[IS_ACTIVE_THREAD11:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP22]]
+// CHECK2-NEXT:    br i1 [[IS_ACTIVE_THREAD11]], label [[THEN12:%.*]], label [[ELSE13:%.*]]
+// CHECK2:       then12:
+// CHECK2-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]]
+// CHECK2-NEXT:    [[TMP24:%.*]] = bitcast i32 addrspace(3)* [[TMP23]] to i16 addrspace(3)*
+// CHECK2-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i64 0, i64 1
+// CHECK2-NEXT:    [[TMP26:%.*]] = load i8*, i8** [[TMP25]], align 8
+// CHECK2-NEXT:    [[TMP27:%.*]] = bitcast i8* [[TMP26]] to i16*
+// CHECK2-NEXT:    [[TMP28:%.*]] = load volatile i16, i16 addrspace(3)* [[TMP24]], align 2
+// CHECK2-NEXT:    store i16 [[TMP28]], i16* [[TMP27]], align 2
+// CHECK2-NEXT:    br label [[IFCONT14:%.*]]
+// CHECK2:       else13:
+// CHECK2-NEXT:    br label [[IFCONT14]]
+// CHECK2:       ifcont14:
+// CHECK2-NEXT:    ret void
+//
+//
+// CHECK2-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_copy_func17
+// CHECK2-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] {
+// CHECK2-NEXT:  entry:
+// CHECK2-NEXT:    [[DOTADDR:%.*]] = alloca i8*, align 8
+// CHECK2-NEXT:    [[DOTADDR1:%.*]] = alloca i32, align 4
+// CHECK2-NEXT:    [[DOTADDR2:%.*]] = alloca i8*, align 8
+// CHECK2-NEXT:    store i8* [[TMP0]], i8** [[DOTADDR]], align 8
+// CHECK2-NEXT:    store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
+// CHECK2-NEXT:    store i8* [[TMP2]], i8** [[DOTADDR2]], align 8
+// CHECK2-NEXT:    [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 8
+// CHECK2-NEXT:    [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]*
+// CHECK2-NEXT:    [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 8
+// CHECK2-NEXT:    [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.5*
+// CHECK2-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4
+// CHECK2-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i64 0, i64 0
+// CHECK2-NEXT:    [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 8
+// CHECK2-NEXT:    [[TMP10:%.*]] = bitcast i8* [[TMP9]] to i32*
+// CHECK2-NEXT:    [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_5:%.*]], %struct._globalized_locals_ty.5* [[TMP6]], i32 0, i32 0
+// CHECK2-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [1024 x i32], [1024 x i32]* [[A]], i32 0, i32 [[TMP7]]
+// CHECK2-NEXT:    [[TMP12:%.*]] = load i32, i32* [[TMP10]], align 4
+// CHECK2-NEXT:    store i32 [[TMP12]], i32* [[TMP11]], align 128
+// CHECK2-NEXT:    [[TMP13:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i64 0, i64 1
+// CHECK2-NEXT:    [[TMP14:%.*]] = load i8*, i8** [[TMP13]], align 8
+// CHECK2-NEXT:    [[TMP15:%.*]] = bitcast i8* [[TMP14]] to i16*
+// CHECK2-NEXT:    [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_5]], %struct._globalized_locals_ty.5* [[TMP6]], i32 0, i32 1
+// CHECK2-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [1024 x i16], [1024 x i16]* [[B]], i32 0, i32 [[TMP7]]
+// CHECK2-NEXT:    [[TMP17:%.*]] = load i16, i16* [[TMP15]], align 2
+// CHECK2-NEXT:    store i16 [[TMP17]], i16* [[TMP16]], align 128
+// CHECK2-NEXT:    ret void
+//
+//
+// CHECK2-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_reduce_func18
+// CHECK2-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] {
+// CHECK2-NEXT:  entry:
+// CHECK2-NEXT:    [[DOTADDR:%.*]] = alloca i8*, align 8
+// CHECK2-NEXT:    [[DOTADDR1:%.*]] = alloca i32, align 4
+// CHECK2-NEXT:    [[DOTADDR2:%.*]] = alloca i8*, align 8
+// CHECK2-NEXT:    [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 8
+// CHECK2-NEXT:    store i8* [[TMP0]], i8** [[DOTADDR]], align 8
+// CHECK2-NEXT:    store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
+// CHECK2-NEXT:    store i8* [[TMP2]], i8** [[DOTADDR2]], align 8
+// CHECK2-NEXT:    [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 8
+// CHECK2-NEXT:    [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.5*
+// CHECK2-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4
+// CHECK2-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 0
+// CHECK2-NEXT:    [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_5:%.*]], %struct._globalized_locals_ty.5* [[TMP4]], i32 0, i32 0
+// CHECK2-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [1024 x i32], [1024 x i32]* [[A]], i32 0, i32 [[TMP5]]
+// CHECK2-NEXT:    [[TMP8:%.*]] = bitcast i32* [[TMP7]] to i8*
+// CHECK2-NEXT:    store i8* [[TMP8]], i8** [[TMP6]], align 8
+// CHECK2-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 1
+// CHECK2-NEXT:    [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_5]], %struct._globalized_locals_ty.5* [[TMP4]], i32 0, i32 1
+// CHECK2-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [1024 x i16], [1024 x i16]* [[B]], i32 0, i32 [[TMP5]]
+// CHECK2-NEXT:    [[TMP11:%.*]] = bitcast i16* [[TMP10]] to i8*
+// CHECK2-NEXT:    store i8* [[TMP11]], i8** [[TMP9]], align 8
+// CHECK2-NEXT:    [[TMP12:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
+// CHECK2-NEXT:    [[TMP13:%.*]] = load i8*, i8** [[DOTADDR2]], align 8
+// CHECK2-NEXT:    call void @"_omp$reduction$reduction_func14"(i8* [[TMP12]], i8* [[TMP13]]) #[[ATTR3]]
+// CHECK2-NEXT:    ret void
+//
+//
+// CHECK2-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_copy_func19
+// CHECK2-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] {
+// CHECK2-NEXT:  entry:
+// CHECK2-NEXT:    [[DOTADDR:%.*]] = alloca i8*, align 8
+// CHECK2-NEXT:    [[DOTADDR1:%.*]] = alloca i32, align 4
+// CHECK2-NEXT:    [[DOTADDR2:%.*]] = alloca i8*, align 8
+// CHECK2-NEXT:    store i8* [[TMP0]], i8** [[DOTADDR]], align 8
+// CHECK2-NEXT:    store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
+// CHECK2-NEXT:    store i8* [[TMP2]], i8** [[DOTADDR2]], align 8
+// CHECK2-NEXT:    [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 8
+// CHECK2-NEXT:    [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]*
+// CHECK2-NEXT:    [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 8
+// CHECK2-NEXT:    [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.5*
+// CHECK2-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4
+// CHECK2-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i64 0, i64 0
+// CHECK2-NEXT:    [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 8
+// CHECK2-NEXT:    [[TMP10:%.*]] = bitcast i8* [[TMP9]] to i32*
+// CHECK2-NEXT:    [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_5:%.*]], %struct._globalized_locals_ty.5* [[TMP6]], i32 0, i32 0
+// CHECK2-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [1024 x i32], [1024 x i32]* [[A]], i32 0, i32 [[TMP7]]
+// CHECK2-NEXT:    [[TMP12:%.*]] = load i32, i32* [[TMP11]], align 128
+// CHECK2-NEXT:    store i32 [[TMP12]], i32* [[TMP10]], align 4
+// CHECK2-NEXT:    [[TMP13:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i64 0, i64 1
+// CHECK2-NEXT:    [[TMP14:%.*]] = load i8*, i8** [[TMP13]], align 8
+// CHECK2-NEXT:    [[TMP15:%.*]] = bitcast i8* [[TMP14]] to i16*
+// CHECK2-NEXT:    [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_5]], %struct._globalized_locals_ty.5* [[TMP6]], i32 0, i32 1
+// CHECK2-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [1024 x i16], [1024 x i16]* [[B]], i32 0, i32 [[TMP7]]
+// CHECK2-NEXT:    [[TMP17:%.*]] = load i16, i16* [[TMP16]], align 128
+// CHECK2-NEXT:    store i16 [[TMP17]], i16* [[TMP15]], align 2
+// CHECK2-NEXT:    ret void
+//
+//
+// CHECK2-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_reduce_func20
+// CHECK2-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] {
+// CHECK2-NEXT:  entry:
+// CHECK2-NEXT:    [[DOTADDR:%.*]] = alloca i8*, align 8
+// CHECK2-NEXT:    [[DOTADDR1:%.*]] = alloca i32, align 4
+// CHECK2-NEXT:    [[DOTADDR2:%.*]] = alloca i8*, align 8
+// CHECK2-NEXT:    [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 8
+// CHECK2-NEXT:    store i8* [[TMP0]], i8** [[DOTADDR]], align 8
+// CHECK2-NEXT:    store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
+// CHECK2-NEXT:    store i8* [[TMP2]], i8** [[DOTADDR2]], align 8
+// CHECK2-NEXT:    [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 8
+// CHECK2-NEXT:    [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.5*
+// CHECK2-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4
+// CHECK2-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 0
+// CHECK2-NEXT:    [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_5:%.*]], %struct._globalized_locals_ty.5* [[TMP4]], i32 0, i32 0
+// CHECK2-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [1024 x i32], [1024 x i32]* [[A]], i32 0, i32 [[TMP5]]
+// CHECK2-NEXT:    [[TMP8:%.*]] = bitcast i32* [[TMP7]] to i8*
+// CHECK2-NEXT:    store i8* [[TMP8]], i8** [[TMP6]], align 8
+// CHECK2-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 1
+// CHECK2-NEXT:    [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_5]], %struct._globalized_locals_ty.5* [[TMP4]], i32 0, i32 1
+// CHECK2-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [1024 x i16], [1024 x i16]* [[B]], i32 0, i32 [[TMP5]]
+// CHECK2-NEXT:    [[TMP11:%.*]] = bitcast i16* [[TMP10]] to i8*
+// CHECK2-NEXT:    store i8* [[TMP11]], i8** [[TMP9]], align 8
+// CHECK2-NEXT:    [[TMP12:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
+// CHECK2-NEXT:    [[TMP13:%.*]] = load i8*, i8** [[DOTADDR2]], align 8
+// CHECK2-NEXT:    call void @"_omp$reduction$reduction_func14"(i8* [[TMP13]], i8* [[TMP12]]) #[[ATTR3]]
+// CHECK2-NEXT:    ret void
+//
+//
+// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l23_worker
+// CHECK3-SAME: () #[[ATTR0:[0-9]+]] {
+// CHECK3-NEXT:  entry:
+// CHECK3-NEXT:    [[WORK_FN:%.*]] = alloca i8*, align 4
+// CHECK3-NEXT:    [[EXEC_STATUS:%.*]] = alloca i8, align 1
+// CHECK3-NEXT:    store i8* null, i8** [[WORK_FN]], align 4
+// CHECK3-NEXT:    store i8 0, i8* [[EXEC_STATUS]], align 1
+// CHECK3-NEXT:    br label [[DOTAWAIT_WORK:%.*]]
+// CHECK3:       .await.work:
+// CHECK3-NEXT:    call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
+// CHECK3-NEXT:    [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]])
+// CHECK3-NEXT:    [[TMP1:%.*]] = zext i1 [[TMP0]] to i8
+// CHECK3-NEXT:    store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1
+// CHECK3-NEXT:    [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 4
+// CHECK3-NEXT:    [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null
+// CHECK3-NEXT:    br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]]
+// CHECK3:       .select.workers:
+// CHECK3-NEXT:    [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1
+// CHECK3-NEXT:    [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0
+// CHECK3-NEXT:    br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]]
+// CHECK3:       .execute.parallel:
+// CHECK3-NEXT:    [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]])
+// CHECK3-NEXT:    [[TMP5:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)*
+// CHECK3-NEXT:    call void [[TMP5]](i16 0, i32 [[TMP4]])
+// CHECK3-NEXT:    br label [[DOTTERMINATE_PARALLEL:%.*]]
+// CHECK3:       .terminate.parallel:
+// CHECK3-NEXT:    call void @__kmpc_kernel_end_parallel()
+// CHECK3-NEXT:    br label [[DOTBARRIER_PARALLEL]]
+// CHECK3:       .barrier.parallel:
+// CHECK3-NEXT:    call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
+// CHECK3-NEXT:    br label [[DOTAWAIT_WORK]]
+// CHECK3:       .exit:
+// CHECK3-NEXT:    ret void
+//
+//
+// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l23
+// CHECK3-SAME: (double* nonnull align 8 dereferenceable(8) [[E:%.*]]) #[[ATTR1:[0-9]+]] {
+// CHECK3-NEXT:  entry:
+// CHECK3-NEXT:    [[E_ADDR:%.*]] = alloca double*, align 4
+// CHECK3-NEXT:    [[E7:%.*]] = alloca double, align 8
+// CHECK3-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK3-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK3-NEXT:    store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK3-NEXT:    store double* [[E]], double** [[E_ADDR]], align 4
+// CHECK3-NEXT:    [[TMP0:%.*]] = load double*, double** [[E_ADDR]], align 4
+// CHECK3-NEXT:    [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+// CHECK3-NEXT:    [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK3-NEXT:    [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+// CHECK3-NEXT:    [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]]
+// CHECK3-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]]
+// CHECK3-NEXT:    br i1 [[TMP1]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]]
+// CHECK3:       .worker:
+// CHECK3-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l23_worker() #[[ATTR3:[0-9]+]]
+// CHECK3-NEXT:    br label [[DOTEXIT:%.*]]
+// CHECK3:       .mastercheck:
+// CHECK3-NEXT:    [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+// CHECK3-NEXT:    [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK3-NEXT:    [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+// CHECK3-NEXT:    [[TMP2:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1
+// CHECK3-NEXT:    [[TMP3:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1
+// CHECK3-NEXT:    [[TMP4:%.*]] = xor i32 [[TMP2]], -1
+// CHECK3-NEXT:    [[MASTER_TID:%.*]] = and i32 [[TMP3]], [[TMP4]]
+// CHECK3-NEXT:    [[TMP5:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]]
+// CHECK3-NEXT:    br i1 [[TMP5]], label [[DOTMASTER:%.*]], label [[DOTEXIT]]
+// CHECK3:       .master:
+// CHECK3-NEXT:    [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK3-NEXT:    [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+// CHECK3-NEXT:    [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]]
+// CHECK3-NEXT:    call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1)
+// CHECK3-NEXT:    call void @__kmpc_data_sharing_init_stack()
+// CHECK3-NEXT:    [[TMP6:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK3-NEXT:    [[TMP7:%.*]] = load double, double* [[TMP0]], align 8
+// CHECK3-NEXT:    store double [[TMP7]], double* [[E7]], align 8
+// CHECK3-NEXT:    store i32 [[TMP6]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK3-NEXT:    call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], double* [[E7]]) #[[ATTR3]]
+// CHECK3-NEXT:    br label [[DOTTERMINATION_NOTIFIER:%.*]]
+// CHECK3:       .termination.notifier:
+// CHECK3-NEXT:    call void @__kmpc_kernel_deinit(i16 1)
+// CHECK3-NEXT:    call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
+// CHECK3-NEXT:    br label [[DOTEXIT]]
+// CHECK3:       .exit:
+// CHECK3-NEXT:    ret void
+//
+//
+// CHECK3-LABEL: define {{[^@]+}}@__omp_outlined__
+// CHECK3-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], double* nonnull align 8 dereferenceable(8) [[E:%.*]]) #[[ATTR1]] {
+// CHECK3-NEXT:  entry:
+// CHECK3-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK3-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK3-NEXT:    [[E_ADDR:%.*]] = alloca double*, align 4
+// CHECK3-NEXT:    [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x i8*], align 4
+// CHECK3-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK3-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK3-NEXT:    store double* [[E]], double** [[E_ADDR]], align 4
+// CHECK3-NEXT:    [[TMP0:%.*]] = load double*, double** [[E_ADDR]], align 4
+// CHECK3-NEXT:    [[TMP1:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared", align 2
+// CHECK3-NEXT:    [[TMP2:%.*]] = load i32, i32* @"_openmp_static_kernel$size", align 4
+// CHECK3-NEXT:    call void @__kmpc_get_team_static_memory(i16 0, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds (%"union._shared_openmp_static_memory_type_$_", %"union._shared_openmp_static_memory_type_$_" addrspace(3)* @"_openmp_shared_static_glob_rd_$_", i32 0, i32 0, i32 0) to i8*), i32 [[TMP2]], i16 [[TMP1]], i8** addrspacecast (i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr" to i8**))
+// CHECK3-NEXT:    [[TMP3:%.*]] = load i8*, i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr", align 4
+// CHECK3-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i8, i8* [[TMP3]], i32 0
+// CHECK3-NEXT:    [[TMP5:%.*]] = bitcast i8* [[TMP4]] to %struct._globalized_locals_ty*
+// CHECK3-NEXT:    [[E1:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP5]], i32 0, i32 0
+// CHECK3-NEXT:    store double 0.000000e+00, double* [[E1]], align 8
+// CHECK3-NEXT:    [[TMP6:%.*]] = load double, double* [[E1]], align 8
+// CHECK3-NEXT:    [[ADD:%.*]] = fadd double [[TMP6]], 5.000000e+00
+// CHECK3-NEXT:    store double [[ADD]], double* [[E1]], align 8
+// CHECK3-NEXT:    [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK3-NEXT:    [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4
+// CHECK3-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0
+// CHECK3-NEXT:    [[TMP10:%.*]] = bitcast double* [[E1]] to i8*
+// CHECK3-NEXT:    store i8* [[TMP10]], i8** [[TMP9]], align 4
+// CHECK3-NEXT:    [[TMP11:%.*]] = bitcast [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
+// CHECK3-NEXT:    [[TMP12:%.*]] = load i8*, i8** @"_openmp_teams_reductions_buffer_$_$ptr", align 4
+// CHECK3-NEXT:    [[TMP13:%.*]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(%struct.ident_t* @[[GLOB1]], i32 [[TMP8]], i8* [[TMP12]], i32 1024, i8* [[TMP11]], void (i8*, i16, i16, i16)* @_omp_reduction_shuffle_and_reduce_func, void (i8*, i32)* @_omp_reduction_inter_warp_copy_func, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_copy_func, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_reduce_func, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_copy_func, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_reduce_func)
+// CHECK3-NEXT:    [[TMP14:%.*]] = icmp eq i32 [[TMP13]], 1
+// CHECK3-NEXT:    br i1 [[TMP14]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]]
+// CHECK3:       .omp.reduction.then:
+// CHECK3-NEXT:    [[TMP15:%.*]] = load double, double* [[TMP0]], align 8
+// CHECK3-NEXT:    [[TMP16:%.*]] = load double, double* [[E1]], align 8
+// CHECK3-NEXT:    [[ADD2:%.*]] = fadd double [[TMP15]], [[TMP16]]
+// CHECK3-NEXT:    store double [[ADD2]], double* [[TMP0]], align 8
+// CHECK3-NEXT:    call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP8]])
+// CHECK3-NEXT:    br label [[DOTOMP_REDUCTION_DONE]]
+// CHECK3:       .omp.reduction.done:
+// CHECK3-NEXT:    [[TMP17:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared", align 2
+// CHECK3-NEXT:    call void @__kmpc_restore_team_static_memory(i16 0, i16 [[TMP17]])
+// CHECK3-NEXT:    ret void
+//
+//
+// CHECK3-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func
+// CHECK3-SAME: (i8* [[TMP0:%.*]], i16 signext [[TMP1:%.*]], i16 signext [[TMP2:%.*]], i16 signext [[TMP3:%.*]]) #[[ATTR0]] {
+// CHECK3-NEXT:  entry:
+// CHECK3-NEXT:    [[DOTADDR:%.*]] = alloca i8*, align 4
+// CHECK3-NEXT:    [[DOTADDR1:%.*]] = alloca i16, align 2
+// CHECK3-NEXT:    [[DOTADDR2:%.*]] = alloca i16, align 2
+// CHECK3-NEXT:    [[DOTADDR3:%.*]] = alloca i16, align 2
+// CHECK3-NEXT:    [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST:%.*]] = alloca [1 x i8*], align 4
+// CHECK3-NEXT:    [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca double, align 8
+// CHECK3-NEXT:    store i8* [[TMP0]], i8** [[DOTADDR]], align 4
+// CHECK3-NEXT:    store i16 [[TMP1]], i16* [[DOTADDR1]], align 2
+// CHECK3-NEXT:    store i16 [[TMP2]], i16* [[DOTADDR2]], align 2
+// CHECK3-NEXT:    store i16 [[TMP3]], i16* [[DOTADDR3]], align 2
+// CHECK3-NEXT:    [[TMP4:%.*]] = load i8*, i8** [[DOTADDR]], align 4
+// CHECK3-NEXT:    [[TMP5:%.*]] = bitcast i8* [[TMP4]] to [1 x i8*]*
+// CHECK3-NEXT:    [[TMP6:%.*]] = load i16, i16* [[DOTADDR1]], align 2
+// CHECK3-NEXT:    [[TMP7:%.*]] = load i16, i16* [[DOTADDR2]], align 2
+// CHECK3-NEXT:    [[TMP8:%.*]] = load i16, i16* [[DOTADDR3]], align 2
+// CHECK3-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP5]], i32 0, i32 0
+// CHECK3-NEXT:    [[TMP10:%.*]] = load i8*, i8** [[TMP9]], align 4
+// CHECK3-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0
+// CHECK3-NEXT:    [[TMP12:%.*]] = bitcast i8* [[TMP10]] to double*
+// CHECK3-NEXT:    [[TMP13:%.*]] = getelementptr double, double* [[TMP12]], i32 1
+// CHECK3-NEXT:    [[TMP14:%.*]] = bitcast double* [[TMP13]] to i8*
+// CHECK3-NEXT:    [[TMP15:%.*]] = bitcast double* [[TMP12]] to i64*
+// CHECK3-NEXT:    [[TMP16:%.*]] = bitcast double* [[DOTOMP_REDUCTION_ELEMENT]] to i64*
+// CHECK3-NEXT:    [[TMP17:%.*]] = load i64, i64* [[TMP15]], align 8
+// CHECK3-NEXT:    [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+// CHECK3-NEXT:    [[TMP18:%.*]] = trunc i32 [[NVPTX_WARP_SIZE]] to i16
+// CHECK3-NEXT:    [[TMP19:%.*]] = call i64 @__kmpc_shuffle_int64(i64 [[TMP17]], i16 [[TMP7]], i16 [[TMP18]])
+// CHECK3-NEXT:    store i64 [[TMP19]], i64* [[TMP16]], align 8
+// CHECK3-NEXT:    [[TMP20:%.*]] = getelementptr i64, i64* [[TMP15]], i32 1
+// CHECK3-NEXT:    [[TMP21:%.*]] = getelementptr i64, i64* [[TMP16]], i32 1
+// CHECK3-NEXT:    [[TMP22:%.*]] = bitcast double* [[DOTOMP_REDUCTION_ELEMENT]] to i8*
+// CHECK3-NEXT:    store i8* [[TMP22]], i8** [[TMP11]], align 4
+// CHECK3-NEXT:    [[TMP23:%.*]] = icmp eq i16 [[TMP8]], 0
+// CHECK3-NEXT:    [[TMP24:%.*]] = icmp eq i16 [[TMP8]], 1
+// CHECK3-NEXT:    [[TMP25:%.*]] = icmp ult i16 [[TMP6]], [[TMP7]]
+// CHECK3-NEXT:    [[TMP26:%.*]] = and i1 [[TMP24]], [[TMP25]]
+// CHECK3-NEXT:    [[TMP27:%.*]] = icmp eq i16 [[TMP8]], 2
+// CHECK3-NEXT:    [[TMP28:%.*]] = and i16 [[TMP6]], 1
+// CHECK3-NEXT:    [[TMP29:%.*]] = icmp eq i16 [[TMP28]], 0
+// CHECK3-NEXT:    [[TMP30:%.*]] = and i1 [[TMP27]], [[TMP29]]
+// CHECK3-NEXT:    [[TMP31:%.*]] = icmp sgt i16 [[TMP7]], 0
+// CHECK3-NEXT:    [[TMP32:%.*]] = and i1 [[TMP30]], [[TMP31]]
+// CHECK3-NEXT:    [[TMP33:%.*]] = or i1 [[TMP23]], [[TMP26]]
+// CHECK3-NEXT:    [[TMP34:%.*]] = or i1 [[TMP33]], [[TMP32]]
+// CHECK3-NEXT:    br i1 [[TMP34]], label [[THEN:%.*]], label [[ELSE:%.*]]
+// CHECK3:       then:
+// CHECK3-NEXT:    [[TMP35:%.*]] = bitcast [1 x i8*]* [[TMP5]] to i8*
+// CHECK3-NEXT:    [[TMP36:%.*]] = bitcast [1 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to i8*
+// CHECK3-NEXT:    call void @"_omp$reduction$reduction_func"(i8* [[TMP35]], i8* [[TMP36]]) #[[ATTR3]]
+// CHECK3-NEXT:    br label [[IFCONT:%.*]]
+// CHECK3:       else:
+// CHECK3-NEXT:    br label [[IFCONT]]
+// CHECK3:       ifcont:
+// CHECK3-NEXT:    [[TMP37:%.*]] = icmp eq i16 [[TMP8]], 1
+// CHECK3-NEXT:    [[TMP38:%.*]] = icmp uge i16 [[TMP6]], [[TMP7]]
+// CHECK3-NEXT:    [[TMP39:%.*]] = and i1 [[TMP37]], [[TMP38]]
+// CHECK3-NEXT:    br i1 [[TMP39]], label [[THEN4:%.*]], label [[ELSE5:%.*]]
+// CHECK3:       then4:
+// CHECK3-NEXT:    [[TMP40:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0
+// CHECK3-NEXT:    [[TMP41:%.*]] = load i8*, i8** [[TMP40]], align 4
+// CHECK3-NEXT:    [[TMP42:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP5]], i32 0, i32 0
+// CHECK3-NEXT:    [[TMP43:%.*]] = load i8*, i8** [[TMP42]], align 4
+// CHECK3-NEXT:    [[TMP44:%.*]] = bitcast i8* [[TMP41]] to double*
+// CHECK3-NEXT:    [[TMP45:%.*]] = bitcast i8* [[TMP43]] to double*
+// CHECK3-NEXT:    [[TMP46:%.*]] = load double, double* [[TMP44]], align 8
+// CHECK3-NEXT:    store double [[TMP46]], double* [[TMP45]], align 8
+// CHECK3-NEXT:    br label [[IFCONT6:%.*]]
+// CHECK3:       else5:
+// CHECK3-NEXT:    br label [[IFCONT6]]
+// CHECK3:       ifcont6:
+// CHECK3-NEXT:    ret void
+//
+//
+// CHECK3-LABEL: define {{[^@]+}}@_omp_reduction_inter_warp_copy_func
+// CHECK3-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] {
+// CHECK3-NEXT:  entry:
+// CHECK3-NEXT:    [[DOTADDR:%.*]] = alloca i8*, align 4
+// CHECK3-NEXT:    [[DOTADDR1:%.*]] = alloca i32, align 4
+// CHECK3-NEXT:    [[DOTCNT_ADDR:%.*]] = alloca i32, align 4
+// CHECK3-NEXT:    [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK3-NEXT:    store i8* [[TMP0]], i8** [[DOTADDR]], align 4
+// CHECK3-NEXT:    store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
+// CHECK3-NEXT:    [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+// CHECK3-NEXT:    [[NVPTX_TID2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+// CHECK3-NEXT:    [[NVPTX_LANE_ID:%.*]] = and i32 [[NVPTX_TID2]], 31
+// CHECK3-NEXT:    [[NVPTX_TID3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+// CHECK3-NEXT:    [[NVPTX_WARP_ID:%.*]] = ashr i32 [[NVPTX_TID3]], 5
+// CHECK3-NEXT:    [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4
+// CHECK3-NEXT:    [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [1 x i8*]*
+// CHECK3-NEXT:    store i32 0, i32* [[DOTCNT_ADDR]], align 4
+// CHECK3-NEXT:    br label [[PRECOND:%.*]]
+// CHECK3:       precond:
+// CHECK3-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTCNT_ADDR]], align 4
+// CHECK3-NEXT:    [[TMP6:%.*]] = icmp ult i32 [[TMP5]], 2
+// CHECK3-NEXT:    br i1 [[TMP6]], label [[BODY:%.*]], label [[EXIT:%.*]]
+// CHECK3:       body:
+// CHECK3-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP2]])
+// CHECK3-NEXT:    [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0
+// CHECK3-NEXT:    br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]]
+// CHECK3:       then:
+// CHECK3-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP4]], i32 0, i32 0
+// CHECK3-NEXT:    [[TMP8:%.*]] = load i8*, i8** [[TMP7]], align 4
+// CHECK3-NEXT:    [[TMP9:%.*]] = bitcast i8* [[TMP8]] to i32*
+// CHECK3-NEXT:    [[TMP10:%.*]] = getelementptr i32, i32* [[TMP9]], i32 [[TMP5]]
+// CHECK3-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]]
+// CHECK3-NEXT:    [[TMP12:%.*]] = load i32, i32* [[TMP10]], align 4
+// CHECK3-NEXT:    store volatile i32 [[TMP12]], i32 addrspace(3)* [[TMP11]], align 4
+// CHECK3-NEXT:    br label [[IFCONT:%.*]]
+// CHECK3:       else:
+// CHECK3-NEXT:    br label [[IFCONT]]
+// CHECK3:       ifcont:
+// CHECK3-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]])
+// CHECK3-NEXT:    [[TMP13:%.*]] = load i32, i32* [[DOTADDR1]], align 4
+// CHECK3-NEXT:    [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP13]]
+// CHECK3-NEXT:    br i1 [[IS_ACTIVE_THREAD]], label [[THEN4:%.*]], label [[ELSE5:%.*]]
+// CHECK3:       then4:
+// CHECK3-NEXT:    [[TMP14:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]]
+// CHECK3-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP4]], i32 0, i32 0
+// CHECK3-NEXT:    [[TMP16:%.*]] = load i8*, i8** [[TMP15]], align 4
+// CHECK3-NEXT:    [[TMP17:%.*]] = bitcast i8* [[TMP16]] to i32*
+// CHECK3-NEXT:    [[TMP18:%.*]] = getelementptr i32, i32* [[TMP17]], i32 [[TMP5]]
+// CHECK3-NEXT:    [[TMP19:%.*]] = load volatile i32, i32 addrspace(3)* [[TMP14]], align 4
+// CHECK3-NEXT:    store i32 [[TMP19]], i32* [[TMP18]], align 4
+// CHECK3-NEXT:    br label [[IFCONT6:%.*]]
+// CHECK3:       else5:
+// CHECK3-NEXT:    br label [[IFCONT6]]
+// CHECK3:       ifcont6:
+// CHECK3-NEXT:    [[TMP20:%.*]] = add nsw i32 [[TMP5]], 1
+// CHECK3-NEXT:    store i32 [[TMP20]], i32* [[DOTCNT_ADDR]], align 4
+// CHECK3-NEXT:    br label [[PRECOND]]
+// CHECK3:       exit:
+// CHECK3-NEXT:    ret void
+//
+//
+// CHECK3-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_copy_func
+// CHECK3-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] {
+// CHECK3-NEXT:  entry:
+// CHECK3-NEXT:    [[DOTADDR:%.*]] = alloca i8*, align 4
+// CHECK3-NEXT:    [[DOTADDR1:%.*]] = alloca i32, align 4
+// CHECK3-NEXT:    [[DOTADDR2:%.*]] = alloca i8*, align 4
+// CHECK3-NEXT:    store i8* [[TMP0]], i8** [[DOTADDR]], align 4
+// CHECK3-NEXT:    store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
+// CHECK3-NEXT:    store i8* [[TMP2]], i8** [[DOTADDR2]], align 4
+// CHECK3-NEXT:    [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 4
+// CHECK3-NEXT:    [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [1 x i8*]*
+// CHECK3-NEXT:    [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 4
+// CHECK3-NEXT:    [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.0*
+// CHECK3-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4
+// CHECK3-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP4]], i32 0, i32 0
+// CHECK3-NEXT:    [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 4
+// CHECK3-NEXT:    [[TMP10:%.*]] = bitcast i8* [[TMP9]] to double*
+// CHECK3-NEXT:    [[E:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0:%.*]], %struct._globalized_locals_ty.0* [[TMP6]], i32 0, i32 0
+// CHECK3-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [1024 x double], [1024 x double]* [[E]], i32 0, i32 [[TMP7]]
+// CHECK3-NEXT:    [[TMP12:%.*]] = load double, double* [[TMP10]], align 8
+// CHECK3-NEXT:    store double [[TMP12]], double* [[TMP11]], align 128
+// CHECK3-NEXT:    ret void
+//
+//
+// CHECK3-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_reduce_func
+// CHECK3-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] {
+// CHECK3-NEXT:  entry:
+// CHECK3-NEXT:    [[DOTADDR:%.*]] = alloca i8*, align 4
+// CHECK3-NEXT:    [[DOTADDR1:%.*]] = alloca i32, align 4
+// CHECK3-NEXT:    [[DOTADDR2:%.*]] = alloca i8*, align 4
+// CHECK3-NEXT:    [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x i8*], align 4
+// CHECK3-NEXT:    store i8* [[TMP0]], i8** [[DOTADDR]], align 4
+// CHECK3-NEXT:    store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
+// CHECK3-NEXT:    store i8* [[TMP2]], i8** [[DOTADDR2]], align 4
+// CHECK3-NEXT:    [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4
+// CHECK3-NEXT:    [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.0*
+// CHECK3-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4
+// CHECK3-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0
+// CHECK3-NEXT:    [[E:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0:%.*]], %struct._globalized_locals_ty.0* [[TMP4]], i32 0, i32 0
+// CHECK3-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [1024 x double], [1024 x double]* [[E]], i32 0, i32 [[TMP5]]
+// CHECK3-NEXT:    [[TMP8:%.*]] = bitcast double* [[TMP7]] to i8*
+// CHECK3-NEXT:    store i8* [[TMP8]], i8** [[TMP6]], align 4
+// CHECK3-NEXT:    [[TMP9:%.*]] = bitcast [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
+// CHECK3-NEXT:    [[TMP10:%.*]] = load i8*, i8** [[DOTADDR2]], align 4
+// CHECK3-NEXT:    call void @"_omp$reduction$reduction_func"(i8* [[TMP9]], i8* [[TMP10]]) #[[ATTR3]]
+// CHECK3-NEXT:    ret void
+//
+//
+// CHECK3-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_copy_func
+// CHECK3-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] {
+// CHECK3-NEXT:  entry:
+// CHECK3-NEXT:    [[DOTADDR:%.*]] = alloca i8*, align 4
+// CHECK3-NEXT:    [[DOTADDR1:%.*]] = alloca i32, align 4
+// CHECK3-NEXT:    [[DOTADDR2:%.*]] = alloca i8*, align 4
+// CHECK3-NEXT:    store i8* [[TMP0]], i8** [[DOTADDR]], align 4
+// CHECK3-NEXT:    store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
+// CHECK3-NEXT:    store i8* [[TMP2]], i8** [[DOTADDR2]], align 4
+// CHECK3-NEXT:    [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 4
+// CHECK3-NEXT:    [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [1 x i8*]*
+// CHECK3-NEXT:    [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 4
+// CHECK3-NEXT:    [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.0*
+// CHECK3-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4
+// CHECK3-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP4]], i32 0, i32 0
+// CHECK3-NEXT:    [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 4
+// CHECK3-NEXT:    [[TMP10:%.*]] = bitcast i8* [[TMP9]] to double*
+// CHECK3-NEXT:    [[E:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0:%.*]], %struct._globalized_locals_ty.0* [[TMP6]], i32 0, i32 0
+// CHECK3-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [1024 x double], [1024 x double]* [[E]], i32 0, i32 [[TMP7]]
+// CHECK3-NEXT:    [[TMP12:%.*]] = load double, double* [[TMP11]], align 128
+// CHECK3-NEXT:    store double [[TMP12]], double* [[TMP10]], align 8
+// CHECK3-NEXT:    ret void
+//
+//
+// CHECK3-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_reduce_func
+// CHECK3-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] {
+// CHECK3-NEXT:  entry:
+// CHECK3-NEXT:    [[DOTADDR:%.*]] = alloca i8*, align 4
+// CHECK3-NEXT:    [[DOTADDR1:%.*]] = alloca i32, align 4
+// CHECK3-NEXT:    [[DOTADDR2:%.*]] = alloca i8*, align 4
+// CHECK3-NEXT:    [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x i8*], align 4
+// CHECK3-NEXT:    store i8* [[TMP0]], i8** [[DOTADDR]], align 4
+// CHECK3-NEXT:    store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
+// CHECK3-NEXT:    store i8* [[TMP2]], i8** [[DOTADDR2]], align 4
+// CHECK3-NEXT:    [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4
+// CHECK3-NEXT:    [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.0*
+// CHECK3-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4
+// CHECK3-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0
+// CHECK3-NEXT:    [[E:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0:%.*]], %struct._globalized_locals_ty.0* [[TMP4]], i32 0, i32 0
+// CHECK3-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [1024 x double], [1024 x double]* [[E]], i32 0, i32 [[TMP5]]
+// CHECK3-NEXT:    [[TMP8:%.*]] = bitcast double* [[TMP7]] to i8*
+// CHECK3-NEXT:    store i8* [[TMP8]], i8** [[TMP6]], align 4
+// CHECK3-NEXT:    [[TMP9:%.*]] = bitcast [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
+// CHECK3-NEXT:    [[TMP10:%.*]] = load i8*, i8** [[DOTADDR2]], align 4
+// CHECK3-NEXT:    call void @"_omp$reduction$reduction_func"(i8* [[TMP10]], i8* [[TMP9]]) #[[ATTR3]]
+// CHECK3-NEXT:    ret void
+//
+//
+// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l29_worker
+// CHECK3-SAME: () #[[ATTR0]] {
+// CHECK3-NEXT:  entry:
+// CHECK3-NEXT:    [[WORK_FN:%.*]] = alloca i8*, align 4
+// CHECK3-NEXT:    [[EXEC_STATUS:%.*]] = alloca i8, align 1
+// CHECK3-NEXT:    store i8* null, i8** [[WORK_FN]], align 4
+// CHECK3-NEXT:    store i8 0, i8* [[EXEC_STATUS]], align 1
+// CHECK3-NEXT:    br label [[DOTAWAIT_WORK:%.*]]
+// CHECK3:       .await.work:
+// CHECK3-NEXT:    call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
+// CHECK3-NEXT:    [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]])
+// CHECK3-NEXT:    [[TMP1:%.*]] = zext i1 [[TMP0]] to i8
+// CHECK3-NEXT:    store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1
+// CHECK3-NEXT:    [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 4
+// CHECK3-NEXT:    [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null
+// CHECK3-NEXT:    br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]]
+// CHECK3:       .select.workers:
+// CHECK3-NEXT:    [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1
+// CHECK3-NEXT:    [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0
+// CHECK3-NEXT:    br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]]
+// CHECK3:       .execute.parallel:
+// CHECK3-NEXT:    [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK3-NEXT:    [[TMP5:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)*
+// CHECK3-NEXT:    call void [[TMP5]](i16 0, i32 [[TMP4]])
+// CHECK3-NEXT:    br label [[DOTTERMINATE_PARALLEL:%.*]]
+// CHECK3:       .terminate.parallel:
+// CHECK3-NEXT:    call void @__kmpc_kernel_end_parallel()
+// CHECK3-NEXT:    br label [[DOTBARRIER_PARALLEL]]
+// CHECK3:       .barrier.parallel:
+// CHECK3-NEXT:    call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
+// CHECK3-NEXT:    br label [[DOTAWAIT_WORK]]
+// CHECK3:       .exit:
+// CHECK3-NEXT:    ret void
+//
+//
+// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l29
+// CHECK3-SAME: (i32 [[C:%.*]], i32 [[D:%.*]]) #[[ATTR1]] {
+// CHECK3-NEXT:  entry:
+// CHECK3-NEXT:    [[C_ADDR:%.*]] = alloca i32, align 4
+// CHECK3-NEXT:    [[D_ADDR:%.*]] = alloca i32, align 4
+// CHECK3-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK3-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK3-NEXT:    store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK3-NEXT:    store i32 [[C]], i32* [[C_ADDR]], align 4
+// CHECK3-NEXT:    store i32 [[D]], i32* [[D_ADDR]], align 4
+// CHECK3-NEXT:    [[CONV:%.*]] = bitcast i32* [[C_ADDR]] to i8*
+// CHECK3-NEXT:    [[CONV1:%.*]] = bitcast i32* [[D_ADDR]] to float*
+// CHECK3-NEXT:    [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+// CHECK3-NEXT:    [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK3-NEXT:    [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+// CHECK3-NEXT:    [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]]
+// CHECK3-NEXT:    [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]]
+// CHECK3-NEXT:    br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]]
+// CHECK3:       .worker:
+// CHECK3-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l29_worker() #[[ATTR3]]
+// CHECK3-NEXT:    br label [[DOTEXIT:%.*]]
+// CHECK3:       .mastercheck:
+// CHECK3-NEXT:    [[NVPTX_TID2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+// CHECK3-NEXT:    [[NVPTX_NUM_THREADS3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK3-NEXT:    [[NVPTX_WARP_SIZE4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+// CHECK3-NEXT:    [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE4]], 1
+// CHECK3-NEXT:    [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS3]], 1
+// CHECK3-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], -1
+// CHECK3-NEXT:    [[MASTER_TID:%.*]] = and i32 [[TMP2]], [[TMP3]]
+// CHECK3-NEXT:    [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID2]], [[MASTER_TID]]
+// CHECK3-NEXT:    br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]]
+// CHECK3:       .master:
+// CHECK3-NEXT:    [[NVPTX_NUM_THREADS5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK3-NEXT:    [[NVPTX_WARP_SIZE6:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+// CHECK3-NEXT:    [[THREAD_LIMIT7:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS5]], [[NVPTX_WARP_SIZE6]]
+// CHECK3-NEXT:    call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT7]], i16 1)
+// CHECK3-NEXT:    call void @__kmpc_data_sharing_init_stack()
+// CHECK3-NEXT:    [[TMP5:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared1", align 2
+// CHECK3-NEXT:    [[TMP6:%.*]] = load i32, i32* @"_openmp_static_kernel$size2", align 4
+// CHECK3-NEXT:    call void @__kmpc_get_team_static_memory(i16 0, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds (%"union._shared_openmp_static_memory_type_$_", %"union._shared_openmp_static_memory_type_$_" addrspace(3)* @"_openmp_shared_static_glob_rd_$_", i32 0, i32 0, i32 0) to i8*), i32 [[TMP6]], i16 [[TMP5]], i8** addrspacecast (i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr" to i8**))
+// CHECK3-NEXT:    [[TMP7:%.*]] = load i8*, i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr", align 4
+// CHECK3-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i8, i8* [[TMP7]], i32 0
+// CHECK3-NEXT:    [[TMP9:%.*]] = bitcast i8* [[TMP8]] to %struct._globalized_locals_ty.1*
+// CHECK3-NEXT:    [[TMP10:%.*]] = load i8, i8* [[CONV]], align 4
+// CHECK3-NEXT:    [[C8:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1:%.*]], %struct._globalized_locals_ty.1* [[TMP9]], i32 0, i32 1
+// CHECK3-NEXT:    store i8 [[TMP10]], i8* [[C8]], align 4
+// CHECK3-NEXT:    [[TMP11:%.*]] = load float, float* [[CONV1]], align 4
+// CHECK3-NEXT:    [[D9:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1]], %struct._globalized_locals_ty.1* [[TMP9]], i32 0, i32 0
+// CHECK3-NEXT:    store float [[TMP11]], float* [[D9]], align 4
+// CHECK3-NEXT:    [[TMP12:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK3-NEXT:    store i32 [[TMP12]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK3-NEXT:    call void @__omp_outlined__3(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i8* [[C8]], float* [[D9]]) #[[ATTR3]]
+// CHECK3-NEXT:    [[TMP13:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared1", align 2
+// CHECK3-NEXT:    call void @__kmpc_restore_team_static_memory(i16 0, i16 [[TMP13]])
+// CHECK3-NEXT:    br label [[DOTTERMINATION_NOTIFIER:%.*]]
+// CHECK3:       .termination.notifier:
+// CHECK3-NEXT:    call void @__kmpc_kernel_deinit(i16 1)
+// CHECK3-NEXT:    call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
+// CHECK3-NEXT:    br label [[DOTEXIT]]
+// CHECK3:       .exit:
+// CHECK3-NEXT:    ret void
+//
+//
+// CHECK3-LABEL: define {{[^@]+}}@__omp_outlined__3
+// CHECK3-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i8* nonnull align 1 dereferenceable(1) [[C:%.*]], float* nonnull align 4 dereferenceable(4) [[D:%.*]]) #[[ATTR1]] {
+// CHECK3-NEXT:  entry:
+// CHECK3-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK3-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK3-NEXT:    [[C_ADDR:%.*]] = alloca i8*, align 4
+// CHECK3-NEXT:    [[D_ADDR:%.*]] = alloca float*, align 4
+// CHECK3-NEXT:    [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 4
+// CHECK3-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK3-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK3-NEXT:    store i8* [[C]], i8** [[C_ADDR]], align 4
+// CHECK3-NEXT:    store float* [[D]], float** [[D_ADDR]], align 4
+// CHECK3-NEXT:    [[TMP0:%.*]] = load i8*, i8** [[C_ADDR]], align 4
+// CHECK3-NEXT:    [[TMP1:%.*]] = load float*, float** [[D_ADDR]], align 4
+// CHECK3-NEXT:    [[TMP2:%.*]] = load i8*, i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr", align 4
+// CHECK3-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, i8* [[TMP2]], i32 8
+// CHECK3-NEXT:    [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.2*
+// CHECK3-NEXT:    [[C1:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_2:%.*]], %struct._globalized_locals_ty.2* [[TMP4]], i32 0, i32 1
+// CHECK3-NEXT:    [[D2:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_2]], %struct._globalized_locals_ty.2* [[TMP4]], i32 0, i32 0
+// CHECK3-NEXT:    store i8 0, i8* [[C1]], align 4
+// CHECK3-NEXT:    store float 1.000000e+00, float* [[D2]], align 4
+// CHECK3-NEXT:    [[TMP5:%.*]] = load i8, i8* [[C1]], align 4
+// CHECK3-NEXT:    [[CONV:%.*]] = sext i8 [[TMP5]] to i32
+// CHECK3-NEXT:    [[XOR:%.*]] = xor i32 [[CONV]], 2
+// CHECK3-NEXT:    [[CONV3:%.*]] = trunc i32 [[XOR]] to i8
+// CHECK3-NEXT:    store i8 [[CONV3]], i8* [[C1]], align 4
+// CHECK3-NEXT:    [[TMP6:%.*]] = load float, float* [[D2]], align 4
+// CHECK3-NEXT:    [[MUL:%.*]] = fmul float [[TMP6]], 3.300000e+01
+// CHECK3-NEXT:    store float [[MUL]], float* [[D2]], align 4
+// CHECK3-NEXT:    [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK3-NEXT:    [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4
+// CHECK3-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0
+// CHECK3-NEXT:    store i8* [[C1]], i8** [[TMP9]], align 4
+// CHECK3-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1
+// CHECK3-NEXT:    [[TMP11:%.*]] = bitcast float* [[D2]] to i8*
+// CHECK3-NEXT:    store i8* [[TMP11]], i8** [[TMP10]], align 4
+// CHECK3-NEXT:    [[TMP12:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
+// CHECK3-NEXT:    [[TMP13:%.*]] = load i8*, i8** @"_openmp_teams_reductions_buffer_$_$ptr", align 4
+// CHECK3-NEXT:    [[TMP14:%.*]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(%struct.ident_t* @[[GLOB1]], i32 [[TMP8]], i8* [[TMP13]], i32 1024, i8* [[TMP12]], void (i8*, i16, i16, i16)* @_omp_reduction_shuffle_and_reduce_func5, void (i8*, i32)* @_omp_reduction_inter_warp_copy_func6, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_copy_func7, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_reduce_func8, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_copy_func9, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_reduce_func10)
+// CHECK3-NEXT:    [[TMP15:%.*]] = icmp eq i32 [[TMP14]], 1
+// CHECK3-NEXT:    br i1 [[TMP15]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]]
+// CHECK3:       .omp.reduction.then:
+// CHECK3-NEXT:    [[TMP16:%.*]] = load i8, i8* [[TMP0]], align 1
+// CHECK3-NEXT:    [[CONV4:%.*]] = sext i8 [[TMP16]] to i32
+// CHECK3-NEXT:    [[TMP17:%.*]] = load i8, i8* [[C1]], align 4
+// CHECK3-NEXT:    [[CONV5:%.*]] = sext i8 [[TMP17]] to i32
+// CHECK3-NEXT:    [[XOR6:%.*]] = xor i32 [[CONV4]], [[CONV5]]
+// CHECK3-NEXT:    [[CONV7:%.*]] = trunc i32 [[XOR6]] to i8
+// CHECK3-NEXT:    store i8 [[CONV7]], i8* [[TMP0]], align 1
+// CHECK3-NEXT:    [[TMP18:%.*]] = load float, float* [[TMP1]], align 4
+// CHECK3-NEXT:    [[TMP19:%.*]] = load float, float* [[D2]], align 4
+// CHECK3-NEXT:    [[MUL8:%.*]] = fmul float [[TMP18]], [[TMP19]]
+// CHECK3-NEXT:    store float [[MUL8]], float* [[TMP1]], align 4
+// CHECK3-NEXT:    call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP8]])
+// CHECK3-NEXT:    br label [[DOTOMP_REDUCTION_DONE]]
+// CHECK3:       .omp.reduction.done:
+// CHECK3-NEXT:    ret void
+//
+//
+// CHECK3-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func5
+// CHECK3-SAME: (i8* [[TMP0:%.*]], i16 signext [[TMP1:%.*]], i16 signext [[TMP2:%.*]], i16 signext [[TMP3:%.*]]) #[[ATTR0]] {
+// CHECK3-NEXT:  entry:
+// CHECK3-NEXT:    [[DOTADDR:%.*]] = alloca i8*, align 4
+// CHECK3-NEXT:    [[DOTADDR1:%.*]] = alloca i16, align 2
+// CHECK3-NEXT:    [[DOTADDR2:%.*]] = alloca i16, align 2
+// CHECK3-NEXT:    [[DOTADDR3:%.*]] = alloca i16, align 2
+// CHECK3-NEXT:    [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST:%.*]] = alloca [2 x i8*], align 4
+// CHECK3-NEXT:    [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca i8, align 1
+// CHECK3-NEXT:    [[DOTOMP_REDUCTION_ELEMENT4:%.*]] = alloca float, align 4
+// CHECK3-NEXT:    store i8* [[TMP0]], i8** [[DOTADDR]], align 4
+// CHECK3-NEXT:    store i16 [[TMP1]], i16* [[DOTADDR1]], align 2
+// CHECK3-NEXT:    store i16 [[TMP2]], i16* [[DOTADDR2]], align 2
+// CHECK3-NEXT:    store i16 [[TMP3]], i16* [[DOTADDR3]], align 2
+// CHECK3-NEXT:    [[TMP4:%.*]] = load i8*, i8** [[DOTADDR]], align 4
+// CHECK3-NEXT:    [[TMP5:%.*]] = bitcast i8* [[TMP4]] to [2 x i8*]*
+// CHECK3-NEXT:    [[TMP6:%.*]] = load i16, i16* [[DOTADDR1]], align 2
+// CHECK3-NEXT:    [[TMP7:%.*]] = load i16, i16* [[DOTADDR2]], align 2
+// CHECK3-NEXT:    [[TMP8:%.*]] = load i16, i16* [[DOTADDR3]], align 2
+// CHECK3-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 0
+// CHECK3-NEXT:    [[TMP10:%.*]] = load i8*, i8** [[TMP9]], align 4
+// CHECK3-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0
+// CHECK3-NEXT:    [[TMP12:%.*]] = getelementptr i8, i8* [[TMP10]], i32 1
+// CHECK3-NEXT:    [[TMP13:%.*]] = load i8, i8* [[TMP10]], align 1
+// CHECK3-NEXT:    [[TMP14:%.*]] = sext i8 [[TMP13]] to i32
+// CHECK3-NEXT:    [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+// CHECK3-NEXT:    [[TMP15:%.*]] = trunc i32 [[NVPTX_WARP_SIZE]] to i16
+// CHECK3-NEXT:    [[TMP16:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP14]], i16 [[TMP7]], i16 [[TMP15]])
+// CHECK3-NEXT:    [[TMP17:%.*]] = trunc i32 [[TMP16]] to i8
+// CHECK3-NEXT:    store i8 [[TMP17]], i8* [[DOTOMP_REDUCTION_ELEMENT]], align 1
+// CHECK3-NEXT:    [[TMP18:%.*]] = getelementptr i8, i8* [[TMP10]], i32 1
+// CHECK3-NEXT:    [[TMP19:%.*]] = getelementptr i8, i8* [[DOTOMP_REDUCTION_ELEMENT]], i32 1
+// CHECK3-NEXT:    store i8* [[DOTOMP_REDUCTION_ELEMENT]], i8** [[TMP11]], align 4
+// CHECK3-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 1
+// CHECK3-NEXT:    [[TMP21:%.*]] = load i8*, i8** [[TMP20]], align 4
+// CHECK3-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1
+// CHECK3-NEXT:    [[TMP23:%.*]] = bitcast i8* [[TMP21]] to float*
+// CHECK3-NEXT:    [[TMP24:%.*]] = getelementptr float, float* [[TMP23]], i32 1
+// CHECK3-NEXT:    [[TMP25:%.*]] = bitcast float* [[TMP24]] to i8*
+// CHECK3-NEXT:    [[TMP26:%.*]] = bitcast float* [[TMP23]] to i32*
+// CHECK3-NEXT:    [[TMP27:%.*]] = bitcast float* [[DOTOMP_REDUCTION_ELEMENT4]] to i32*
+// CHECK3-NEXT:    [[TMP28:%.*]] = load i32, i32* [[TMP26]], align 4
+// CHECK3-NEXT:    [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+// CHECK3-NEXT:    [[TMP29:%.*]] = trunc i32 [[NVPTX_WARP_SIZE5]] to i16
+// CHECK3-NEXT:    [[TMP30:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP28]], i16 [[TMP7]], i16 [[TMP29]])
+// CHECK3-NEXT:    store i32 [[TMP30]], i32* [[TMP27]], align 4
+// CHECK3-NEXT:    [[TMP31:%.*]] = getelementptr i32, i32* [[TMP26]], i32 1
+// CHECK3-NEXT:    [[TMP32:%.*]] = getelementptr i32, i32* [[TMP27]], i32 1
+// CHECK3-NEXT:    [[TMP33:%.*]] = bitcast float* [[DOTOMP_REDUCTION_ELEMENT4]] to i8*
+// CHECK3-NEXT:    store i8* [[TMP33]], i8** [[TMP22]], align 4
+// CHECK3-NEXT:    [[TMP34:%.*]] = icmp eq i16 [[TMP8]], 0
+// CHECK3-NEXT:    [[TMP35:%.*]] = icmp eq i16 [[TMP8]], 1
+// CHECK3-NEXT:    [[TMP36:%.*]] = icmp ult i16 [[TMP6]], [[TMP7]]
+// CHECK3-NEXT:    [[TMP37:%.*]] = and i1 [[TMP35]], [[TMP36]]
+// CHECK3-NEXT:    [[TMP38:%.*]] = icmp eq i16 [[TMP8]], 2
+// CHECK3-NEXT:    [[TMP39:%.*]] = and i16 [[TMP6]], 1
+// CHECK3-NEXT:    [[TMP40:%.*]] = icmp eq i16 [[TMP39]], 0
+// CHECK3-NEXT:    [[TMP41:%.*]] = and i1 [[TMP38]], [[TMP40]]
+// CHECK3-NEXT:    [[TMP42:%.*]] = icmp sgt i16 [[TMP7]], 0
+// CHECK3-NEXT:    [[TMP43:%.*]] = and i1 [[TMP41]], [[TMP42]]
+// CHECK3-NEXT:    [[TMP44:%.*]] = or i1 [[TMP34]], [[TMP37]]
+// CHECK3-NEXT:    [[TMP45:%.*]] = or i1 [[TMP44]], [[TMP43]]
+// CHECK3-NEXT:    br i1 [[TMP45]], label [[THEN:%.*]], label [[ELSE:%.*]]
+// CHECK3:       then:
+// CHECK3-NEXT:    [[TMP46:%.*]] = bitcast [2 x i8*]* [[TMP5]] to i8*
+// CHECK3-NEXT:    [[TMP47:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to i8*
+// CHECK3-NEXT:    call void @"_omp$reduction$reduction_func4"(i8* [[TMP46]], i8* [[TMP47]]) #[[ATTR3]]
+// CHECK3-NEXT:    br label [[IFCONT:%.*]]
+// CHECK3:       else:
+// CHECK3-NEXT:    br label [[IFCONT]]
+// CHECK3:       ifcont:
+// CHECK3-NEXT:    [[TMP48:%.*]] = icmp eq i16 [[TMP8]], 1
+// CHECK3-NEXT:    [[TMP49:%.*]] = icmp uge i16 [[TMP6]], [[TMP7]]
+// CHECK3-NEXT:    [[TMP50:%.*]] = and i1 [[TMP48]], [[TMP49]]
+// CHECK3-NEXT:    br i1 [[TMP50]], label [[THEN6:%.*]], label [[ELSE7:%.*]]
+// CHECK3:       then6:
+// CHECK3-NEXT:    [[TMP51:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0
+// CHECK3-NEXT:    [[TMP52:%.*]] = load i8*, i8** [[TMP51]], align 4
+// CHECK3-NEXT:    [[TMP53:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 0
+// CHECK3-NEXT:    [[TMP54:%.*]] = load i8*, i8** [[TMP53]], align 4
+// CHECK3-NEXT:    [[TMP55:%.*]] = load i8, i8* [[TMP52]], align 1
+// CHECK3-NEXT:    store i8 [[TMP55]], i8* [[TMP54]], align 1
+// CHECK3-NEXT:    [[TMP56:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1
+// CHECK3-NEXT:    [[TMP57:%.*]] = load i8*, i8** [[TMP56]], align 4
+// CHECK3-NEXT:    [[TMP58:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 1
+// CHECK3-NEXT:    [[TMP59:%.*]] = load i8*, i8** [[TMP58]], align 4
+// CHECK3-NEXT:    [[TMP60:%.*]] = bitcast i8* [[TMP57]] to float*
+// CHECK3-NEXT:    [[TMP61:%.*]] = bitcast i8* [[TMP59]] to float*
+// CHECK3-NEXT:    [[TMP62:%.*]] = load float, float* [[TMP60]], align 4
+// CHECK3-NEXT:    store float [[TMP62]], float* [[TMP61]], align 4
+// CHECK3-NEXT:    br label [[IFCONT8:%.*]]
+// CHECK3:       else7:
+// CHECK3-NEXT:    br label [[IFCONT8]]
+// CHECK3:       ifcont8:
+// CHECK3-NEXT:    ret void
+//
+//
+// CHECK3-LABEL: define {{[^@]+}}@_omp_reduction_inter_warp_copy_func6
+// CHECK3-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] {
+// CHECK3-NEXT:  entry:
+// CHECK3-NEXT:    [[DOTADDR:%.*]] = alloca i8*, align 4
+// CHECK3-NEXT:    [[DOTADDR1:%.*]] = alloca i32, align 4
+// CHECK3-NEXT:    [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK3-NEXT:    store i8* [[TMP0]], i8** [[DOTADDR]], align 4
+// CHECK3-NEXT:    store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
+// CHECK3-NEXT:    [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+// CHECK3-NEXT:    [[NVPTX_TID2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+// CHECK3-NEXT:    [[NVPTX_LANE_ID:%.*]] = and i32 [[NVPTX_TID2]], 31
+// CHECK3-NEXT:    [[NVPTX_TID3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+// CHECK3-NEXT:    [[NVPTX_WARP_ID:%.*]] = ashr i32 [[NVPTX_TID3]], 5
+// CHECK3-NEXT:    [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4
+// CHECK3-NEXT:    [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]*
+// CHECK3-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]])
+// CHECK3-NEXT:    [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0
+// CHECK3-NEXT:    br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]]
+// CHECK3:       then:
+// CHECK3-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0
+// CHECK3-NEXT:    [[TMP6:%.*]] = load i8*, i8** [[TMP5]], align 4
+// CHECK3-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]]
+// CHECK3-NEXT:    [[TMP8:%.*]] = bitcast i32 addrspace(3)* [[TMP7]] to i8 addrspace(3)*
+// CHECK3-NEXT:    [[TMP9:%.*]] = load i8, i8* [[TMP6]], align 1
+// CHECK3-NEXT:    store volatile i8 [[TMP9]], i8 addrspace(3)* [[TMP8]], align 1
+// CHECK3-NEXT:    br label [[IFCONT:%.*]]
+// CHECK3:       else:
+// CHECK3-NEXT:    br label [[IFCONT]]
+// CHECK3:       ifcont:
+// CHECK3-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]])
+// CHECK3-NEXT:    [[TMP10:%.*]] = load i32, i32* [[DOTADDR1]], align 4
+// CHECK3-NEXT:    [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP10]]
+// CHECK3-NEXT:    br i1 [[IS_ACTIVE_THREAD]], label [[THEN4:%.*]], label [[ELSE5:%.*]]
+// CHECK3:       then4:
+// CHECK3-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]]
+// CHECK3-NEXT:    [[TMP12:%.*]] = bitcast i32 addrspace(3)* [[TMP11]] to i8 addrspace(3)*
+// CHECK3-NEXT:    [[TMP13:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0
+// CHECK3-NEXT:    [[TMP14:%.*]] = load i8*, i8** [[TMP13]], align 4
+// CHECK3-NEXT:    [[TMP15:%.*]] = load volatile i8, i8 addrspace(3)* [[TMP12]], align 1
+// CHECK3-NEXT:    store i8 [[TMP15]], i8* [[TMP14]], align 1
+// CHECK3-NEXT:    br label [[IFCONT6:%.*]]
+// CHECK3:       else5:
+// CHECK3-NEXT:    br label [[IFCONT6]]
+// CHECK3:       ifcont6:
+// CHECK3-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]])
+// CHECK3-NEXT:    [[WARP_MASTER7:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0
+// CHECK3-NEXT:    br i1 [[WARP_MASTER7]], label [[THEN8:%.*]], label [[ELSE9:%.*]]
+// CHECK3:       then8:
+// CHECK3-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1
+// CHECK3-NEXT:    [[TMP17:%.*]] = load i8*, i8** [[TMP16]], align 4
+// CHECK3-NEXT:    [[TMP18:%.*]] = bitcast i8* [[TMP17]] to i32*
+// CHECK3-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]]
+// CHECK3-NEXT:    [[TMP20:%.*]] = load i32, i32* [[TMP18]], align 4
+// CHECK3-NEXT:    store volatile i32 [[TMP20]], i32 addrspace(3)* [[TMP19]], align 4
+// CHECK3-NEXT:    br label [[IFCONT10:%.*]]
+// CHECK3:       else9:
+// CHECK3-NEXT:    br label [[IFCONT10]]
+// CHECK3:       ifcont10:
+// CHECK3-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]])
+// CHECK3-NEXT:    [[TMP21:%.*]] = load i32, i32* [[DOTADDR1]], align 4
+// CHECK3-NEXT: [[IS_ACTIVE_THREAD11:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP21]] +// CHECK3-NEXT: br i1 [[IS_ACTIVE_THREAD11]], label [[THEN12:%.*]], label [[ELSE13:%.*]] +// CHECK3: then12: +// CHECK3-NEXT: [[TMP22:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]] +// CHECK3-NEXT: [[TMP23:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1 +// CHECK3-NEXT: [[TMP24:%.*]] = load i8*, i8** [[TMP23]], align 4 +// CHECK3-NEXT: [[TMP25:%.*]] = bitcast i8* [[TMP24]] to i32* +// CHECK3-NEXT: [[TMP26:%.*]] = load volatile i32, i32 addrspace(3)* [[TMP22]], align 4 +// CHECK3-NEXT: store i32 [[TMP26]], i32* [[TMP25]], align 4 +// CHECK3-NEXT: br label [[IFCONT14:%.*]] +// CHECK3: else13: +// CHECK3-NEXT: br label [[IFCONT14]] +// CHECK3: ifcont14: +// CHECK3-NEXT: ret void +// +// +// CHECK3-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_copy_func7 +// CHECK3-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { +// CHECK3-NEXT: entry: +// CHECK3-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 +// CHECK3-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4 +// CHECK3-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 +// CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK3-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4 +// CHECK3-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 4 +// CHECK3-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]* +// CHECK3-NEXT: [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 4 +// CHECK3-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.3* +// CHECK3-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4 +// CHECK3-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0 +// CHECK3-NEXT: [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 4 +// CHECK3-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_3:%.*]], %struct._globalized_locals_ty.3* [[TMP6]], i32 0, i32 0 +// CHECK3-NEXT: [[TMP10:%.*]] = getelementptr inbounds [1024 x i8], [1024 x i8]* [[C]], i32 0, i32 [[TMP7]] +// CHECK3-NEXT: [[TMP11:%.*]] = load i8, i8* [[TMP9]], align 1 +// CHECK3-NEXT: store i8 [[TMP11]], i8* [[TMP10]], align 128 +// CHECK3-NEXT: [[TMP12:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1 +// CHECK3-NEXT: [[TMP13:%.*]] = load i8*, i8** [[TMP12]], align 4 +// CHECK3-NEXT: [[TMP14:%.*]] = bitcast i8* [[TMP13]] to float* +// CHECK3-NEXT: [[D:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_3]], %struct._globalized_locals_ty.3* [[TMP6]], i32 0, i32 1 +// CHECK3-NEXT: [[TMP15:%.*]] = getelementptr inbounds [1024 x float], [1024 x float]* [[D]], i32 0, i32 [[TMP7]] +// CHECK3-NEXT: [[TMP16:%.*]] = load float, float* [[TMP14]], align 4 +// CHECK3-NEXT: store float [[TMP16]], float* [[TMP15]], align 128 +// CHECK3-NEXT: ret void +// +// +// CHECK3-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_reduce_func8 +// CHECK3-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { +// CHECK3-NEXT: entry: +// CHECK3-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 +// CHECK3-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4 +// CHECK3-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 4 +// CHECK3-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 +// CHECK3-NEXT: store i32 [[TMP1]], 
i32* [[DOTADDR1]], align 4 +// CHECK3-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4 +// CHECK3-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4 +// CHECK3-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.3* +// CHECK3-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4 +// CHECK3-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0 +// CHECK3-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_3:%.*]], %struct._globalized_locals_ty.3* [[TMP4]], i32 0, i32 0 +// CHECK3-NEXT: [[TMP7:%.*]] = getelementptr inbounds [1024 x i8], [1024 x i8]* [[C]], i32 0, i32 [[TMP5]] +// CHECK3-NEXT: store i8* [[TMP7]], i8** [[TMP6]], align 4 +// CHECK3-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1 +// CHECK3-NEXT: [[D:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_3]], %struct._globalized_locals_ty.3* [[TMP4]], i32 0, i32 1 +// CHECK3-NEXT: [[TMP9:%.*]] = getelementptr inbounds [1024 x float], [1024 x float]* [[D]], i32 0, i32 [[TMP5]] +// CHECK3-NEXT: [[TMP10:%.*]] = bitcast float* [[TMP9]] to i8* +// CHECK3-NEXT: store i8* [[TMP10]], i8** [[TMP8]], align 4 +// CHECK3-NEXT: [[TMP11:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8* +// CHECK3-NEXT: [[TMP12:%.*]] = load i8*, i8** [[DOTADDR2]], align 4 +// CHECK3-NEXT: call void @"_omp$reduction$reduction_func4"(i8* [[TMP11]], i8* [[TMP12]]) #[[ATTR3]] +// CHECK3-NEXT: ret void +// +// +// CHECK3-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_copy_func9 +// CHECK3-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { +// CHECK3-NEXT: entry: +// CHECK3-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 +// CHECK3-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4 +// CHECK3-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 +// CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK3-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4 +// CHECK3-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 4 +// CHECK3-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]* +// CHECK3-NEXT: [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 4 +// CHECK3-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.3* +// CHECK3-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4 +// CHECK3-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0 +// CHECK3-NEXT: [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 4 +// CHECK3-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_3:%.*]], %struct._globalized_locals_ty.3* [[TMP6]], i32 0, i32 0 +// CHECK3-NEXT: [[TMP10:%.*]] = getelementptr inbounds [1024 x i8], [1024 x i8]* [[C]], i32 0, i32 [[TMP7]] +// CHECK3-NEXT: [[TMP11:%.*]] = load i8, i8* [[TMP10]], align 128 +// CHECK3-NEXT: store i8 [[TMP11]], i8* [[TMP9]], align 1 +// CHECK3-NEXT: [[TMP12:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1 +// CHECK3-NEXT: [[TMP13:%.*]] = load i8*, i8** [[TMP12]], align 4 +// CHECK3-NEXT: [[TMP14:%.*]] = bitcast i8* [[TMP13]] to float* +// CHECK3-NEXT: [[D:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_3]], %struct._globalized_locals_ty.3* [[TMP6]], i32 0, i32 1 +// CHECK3-NEXT: [[TMP15:%.*]] = getelementptr inbounds [1024 x float], [1024 x float]* [[D]], i32 0, i32 [[TMP7]] +// CHECK3-NEXT: [[TMP16:%.*]] = load float, float* 
[[TMP15]], align 128
+// CHECK3-NEXT: store float [[TMP16]], float* [[TMP14]], align 4
+// CHECK3-NEXT: ret void
+//
+//
+// CHECK3-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_reduce_func10
+// CHECK3-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] {
+// CHECK3-NEXT: entry:
+// CHECK3-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
+// CHECK3-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4
+// CHECK3-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 4
+// CHECK3-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
+// CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
+// CHECK3-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4
+// CHECK3-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4
+// CHECK3-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.3*
+// CHECK3-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4
+// CHECK3-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0
+// CHECK3-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_3:%.*]], %struct._globalized_locals_ty.3* [[TMP4]], i32 0, i32 0
+// CHECK3-NEXT: [[TMP7:%.*]] = getelementptr inbounds [1024 x i8], [1024 x i8]* [[C]], i32 0, i32 [[TMP5]]
+// CHECK3-NEXT: store i8* [[TMP7]], i8** [[TMP6]], align 4
+// CHECK3-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1
+// CHECK3-NEXT: [[D:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_3]], %struct._globalized_locals_ty.3* [[TMP4]], i32 0, i32 1
+// CHECK3-NEXT: [[TMP9:%.*]] = getelementptr inbounds [1024 x float], [1024 x float]* [[D]], i32 0, i32 [[TMP5]]
+// CHECK3-NEXT: [[TMP10:%.*]] = bitcast float* [[TMP9]] to i8*
+// CHECK3-NEXT: store i8* [[TMP10]], i8** [[TMP8]], align 4
+// CHECK3-NEXT: [[TMP11:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
+// CHECK3-NEXT: [[TMP12:%.*]] = load i8*, i8** [[DOTADDR2]], align 4
+// CHECK3-NEXT: call void @"_omp$reduction$reduction_func4"(i8* [[TMP12]], i8* [[TMP11]]) #[[ATTR3]]
+// CHECK3-NEXT: ret void
+//
+//
+// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l36
+// CHECK3-SAME: (i32 [[A:%.*]], i32 [[B:%.*]]) #[[ATTR1]] {
+// CHECK3-NEXT: entry:
+// CHECK3-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[B_ADDR:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK3-NEXT: store i32 [[A]], i32* [[A_ADDR]], align 4
+// CHECK3-NEXT: store i32 [[B]], i32* [[B_ADDR]], align 4
+// CHECK3-NEXT: [[CONV:%.*]] = bitcast i32* [[B_ADDR]] to i16*
+// CHECK3-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK3-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 1)
+// CHECK3-NEXT: call void @__kmpc_data_sharing_init_stack_spmd()
+// CHECK3-NEXT: br label [[DOTEXECUTE:%.*]]
+// CHECK3: .execute:
+// CHECK3-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3:[0-9]+]])
+// CHECK3-NEXT: store i32 [[TMP0]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK3-NEXT: call void @__omp_outlined__11(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32* [[A_ADDR]], i16* [[CONV]]) #[[ATTR3]]
+// CHECK3-NEXT: br label [[DOTOMP_DEINIT:%.*]]
+// CHECK3: .omp.deinit:
+// CHECK3-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 1)
+// CHECK3-NEXT: br label [[DOTEXIT:%.*]]
+// CHECK3: .exit:
+// CHECK3-NEXT: ret void
+//
+//
+// CHECK3-LABEL: define {{[^@]+}}@__omp_outlined__11
+// CHECK3-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i16* nonnull align 2 dereferenceable(2) [[B:%.*]]) #[[ATTR1]] {
+// CHECK3-NEXT: entry:
+// CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK3-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 4
+// CHECK3-NEXT: [[B_ADDR:%.*]] = alloca i16*, align 4
+// CHECK3-NEXT: [[A1:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[B2:%.*]] = alloca i16, align 2
+// CHECK3-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [2 x i8*], align 4
+// CHECK3-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 4
+// CHECK3-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK3-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK3-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 4
+// CHECK3-NEXT: store i16* [[B]], i16** [[B_ADDR]], align 4
+// CHECK3-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 4
+// CHECK3-NEXT: [[TMP1:%.*]] = load i16*, i16** [[B_ADDR]], align 4
+// CHECK3-NEXT: store i32 0, i32* [[A1]], align 4
+// CHECK3-NEXT: store i16 -32768, i16* [[B2]], align 2
+// CHECK3-NEXT: [[TMP2:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0
+// CHECK3-NEXT: [[TMP3:%.*]] = bitcast i32* [[A1]] to i8*
+// CHECK3-NEXT: store i8* [[TMP3]], i8** [[TMP2]], align 4
+// CHECK3-NEXT: [[TMP4:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1
+// CHECK3-NEXT: [[TMP5:%.*]] = bitcast i16* [[B2]] to i8*
+// CHECK3-NEXT: store i8* [[TMP5]], i8** [[TMP4]], align 4
+// CHECK3-NEXT: [[TMP6:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK3-NEXT: [[TMP7:%.*]] = load i32, i32* [[TMP6]], align 4
+// CHECK3-NEXT: [[TMP8:%.*]] = bitcast [2 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK3-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP7]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32*, i16*)* @__omp_outlined__12 to i8*), i8* null, i8** [[TMP8]], i32 2)
+// CHECK3-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0
+// CHECK3-NEXT: [[TMP10:%.*]] = bitcast i32* [[A1]] to i8*
+// CHECK3-NEXT: store i8* [[TMP10]], i8** [[TMP9]], align 4
+// CHECK3-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1
+// CHECK3-NEXT: [[TMP12:%.*]] = bitcast i16* [[B2]] to i8*
+// CHECK3-NEXT: store i8* [[TMP12]], i8** [[TMP11]], align 4
+// CHECK3-NEXT: [[TMP13:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
+// CHECK3-NEXT: [[TMP14:%.*]] = load i8*, i8** @"_openmp_teams_reductions_buffer_$_$ptr", align 4
+// CHECK3-NEXT: [[TMP15:%.*]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(%struct.ident_t* @[[GLOB3]], i32 [[TMP7]], i8* [[TMP14]], i32 1024, i8* [[TMP13]], void (i8*, i16, i16, i16)* @_omp_reduction_shuffle_and_reduce_func17, void (i8*, i32)* @_omp_reduction_inter_warp_copy_func18, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_copy_func19, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_reduce_func20, void (i8*, i32, i8*)*
@_omp_reduction_global_to_list_copy_func21, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_reduce_func22) +// CHECK3-NEXT: [[TMP16:%.*]] = icmp eq i32 [[TMP15]], 1 +// CHECK3-NEXT: br i1 [[TMP16]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]] +// CHECK3: .omp.reduction.then: +// CHECK3-NEXT: [[TMP17:%.*]] = load i32, i32* [[TMP0]], align 4 +// CHECK3-NEXT: [[TMP18:%.*]] = load i32, i32* [[A1]], align 4 +// CHECK3-NEXT: [[OR:%.*]] = or i32 [[TMP17]], [[TMP18]] +// CHECK3-NEXT: store i32 [[OR]], i32* [[TMP0]], align 4 +// CHECK3-NEXT: [[TMP19:%.*]] = load i16, i16* [[TMP1]], align 2 +// CHECK3-NEXT: [[CONV:%.*]] = sext i16 [[TMP19]] to i32 +// CHECK3-NEXT: [[TMP20:%.*]] = load i16, i16* [[B2]], align 2 +// CHECK3-NEXT: [[CONV3:%.*]] = sext i16 [[TMP20]] to i32 +// CHECK3-NEXT: [[CMP:%.*]] = icmp sgt i32 [[CONV]], [[CONV3]] +// CHECK3-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK3: cond.true: +// CHECK3-NEXT: [[TMP21:%.*]] = load i16, i16* [[TMP1]], align 2 +// CHECK3-NEXT: br label [[COND_END:%.*]] +// CHECK3: cond.false: +// CHECK3-NEXT: [[TMP22:%.*]] = load i16, i16* [[B2]], align 2 +// CHECK3-NEXT: br label [[COND_END]] +// CHECK3: cond.end: +// CHECK3-NEXT: [[COND:%.*]] = phi i16 [ [[TMP21]], [[COND_TRUE]] ], [ [[TMP22]], [[COND_FALSE]] ] +// CHECK3-NEXT: store i16 [[COND]], i16* [[TMP1]], align 2 +// CHECK3-NEXT: call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP7]]) +// CHECK3-NEXT: br label [[DOTOMP_REDUCTION_DONE]] +// CHECK3: .omp.reduction.done: +// CHECK3-NEXT: ret void +// +// +// CHECK3-LABEL: define {{[^@]+}}@__omp_outlined__12 +// CHECK3-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i16* nonnull align 2 dereferenceable(2) [[B:%.*]]) #[[ATTR1]] { +// CHECK3-NEXT: entry: +// CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK3-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 4 +// CHECK3-NEXT: [[B_ADDR:%.*]] = alloca i16*, align 4 +// CHECK3-NEXT: [[A1:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[B2:%.*]] = alloca i16, align 2 +// CHECK3-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 4 +// CHECK3-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK3-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK3-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 4 +// CHECK3-NEXT: store i16* [[B]], i16** [[B_ADDR]], align 4 +// CHECK3-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 4 +// CHECK3-NEXT: [[TMP1:%.*]] = load i16*, i16** [[B_ADDR]], align 4 +// CHECK3-NEXT: store i32 0, i32* [[A1]], align 4 +// CHECK3-NEXT: store i16 -32768, i16* [[B2]], align 2 +// CHECK3-NEXT: [[TMP2:%.*]] = load i32, i32* [[A1]], align 4 +// CHECK3-NEXT: [[OR:%.*]] = or i32 [[TMP2]], 1 +// CHECK3-NEXT: store i32 [[OR]], i32* [[A1]], align 4 +// CHECK3-NEXT: [[TMP3:%.*]] = load i16, i16* [[B2]], align 2 +// CHECK3-NEXT: [[CONV:%.*]] = sext i16 [[TMP3]] to i32 +// CHECK3-NEXT: [[CMP:%.*]] = icmp sgt i32 99, [[CONV]] +// CHECK3-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK3: cond.true: +// CHECK3-NEXT: br label [[COND_END:%.*]] +// CHECK3: cond.false: +// CHECK3-NEXT: [[TMP4:%.*]] = load i16, i16* [[B2]], align 2 +// CHECK3-NEXT: [[CONV3:%.*]] = sext i16 [[TMP4]] to i32 +// CHECK3-NEXT: br label [[COND_END]] +// CHECK3: cond.end: +// CHECK3-NEXT: 
[[COND:%.*]] = phi i32 [ 99, [[COND_TRUE]] ], [ [[CONV3]], [[COND_FALSE]] ] +// CHECK3-NEXT: [[CONV4:%.*]] = trunc i32 [[COND]] to i16 +// CHECK3-NEXT: store i16 [[CONV4]], i16* [[B2]], align 2 +// CHECK3-NEXT: [[TMP5:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK3-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4 +// CHECK3-NEXT: [[TMP7:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0 +// CHECK3-NEXT: [[TMP8:%.*]] = bitcast i32* [[A1]] to i8* +// CHECK3-NEXT: store i8* [[TMP8]], i8** [[TMP7]], align 4 +// CHECK3-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1 +// CHECK3-NEXT: [[TMP10:%.*]] = bitcast i16* [[B2]] to i8* +// CHECK3-NEXT: store i8* [[TMP10]], i8** [[TMP9]], align 4 +// CHECK3-NEXT: [[TMP11:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8* +// CHECK3-NEXT: [[TMP12:%.*]] = call i32 @__kmpc_nvptx_parallel_reduce_nowait_v2(%struct.ident_t* @[[GLOB3]], i32 [[TMP6]], i32 2, i32 8, i8* [[TMP11]], void (i8*, i16, i16, i16)* @_omp_reduction_shuffle_and_reduce_func14, void (i8*, i32)* @_omp_reduction_inter_warp_copy_func15) +// CHECK3-NEXT: [[TMP13:%.*]] = icmp eq i32 [[TMP12]], 1 +// CHECK3-NEXT: br i1 [[TMP13]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]] +// CHECK3: .omp.reduction.then: +// CHECK3-NEXT: [[TMP14:%.*]] = load i32, i32* [[TMP0]], align 4 +// CHECK3-NEXT: [[TMP15:%.*]] = load i32, i32* [[A1]], align 4 +// CHECK3-NEXT: [[OR5:%.*]] = or i32 [[TMP14]], [[TMP15]] +// CHECK3-NEXT: store i32 [[OR5]], i32* [[TMP0]], align 4 +// CHECK3-NEXT: [[TMP16:%.*]] = load i16, i16* [[TMP1]], align 2 +// CHECK3-NEXT: [[CONV6:%.*]] = sext i16 [[TMP16]] to i32 +// CHECK3-NEXT: [[TMP17:%.*]] = load i16, i16* [[B2]], align 2 +// CHECK3-NEXT: [[CONV7:%.*]] = sext i16 [[TMP17]] to i32 +// CHECK3-NEXT: [[CMP8:%.*]] = icmp sgt i32 [[CONV6]], [[CONV7]] +// CHECK3-NEXT: br i1 [[CMP8]], label [[COND_TRUE9:%.*]], label [[COND_FALSE10:%.*]] +// CHECK3: cond.true9: +// CHECK3-NEXT: [[TMP18:%.*]] = load i16, i16* [[TMP1]], align 2 +// CHECK3-NEXT: br label [[COND_END11:%.*]] +// CHECK3: cond.false10: +// CHECK3-NEXT: [[TMP19:%.*]] = load i16, i16* [[B2]], align 2 +// CHECK3-NEXT: br label [[COND_END11]] +// CHECK3: cond.end11: +// CHECK3-NEXT: [[COND12:%.*]] = phi i16 [ [[TMP18]], [[COND_TRUE9]] ], [ [[TMP19]], [[COND_FALSE10]] ] +// CHECK3-NEXT: store i16 [[COND12]], i16* [[TMP1]], align 2 +// CHECK3-NEXT: call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP6]]) +// CHECK3-NEXT: br label [[DOTOMP_REDUCTION_DONE]] +// CHECK3: .omp.reduction.done: +// CHECK3-NEXT: ret void +// +// +// CHECK3-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func14 +// CHECK3-SAME: (i8* [[TMP0:%.*]], i16 signext [[TMP1:%.*]], i16 signext [[TMP2:%.*]], i16 signext [[TMP3:%.*]]) #[[ATTR0]] { +// CHECK3-NEXT: entry: +// CHECK3-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 +// CHECK3-NEXT: [[DOTADDR1:%.*]] = alloca i16, align 2 +// CHECK3-NEXT: [[DOTADDR2:%.*]] = alloca i16, align 2 +// CHECK3-NEXT: [[DOTADDR3:%.*]] = alloca i16, align 2 +// CHECK3-NEXT: [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST:%.*]] = alloca [2 x i8*], align 4 +// CHECK3-NEXT: [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTOMP_REDUCTION_ELEMENT4:%.*]] = alloca i16, align 2 +// CHECK3-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 +// CHECK3-NEXT: store i16 [[TMP1]], i16* [[DOTADDR1]], align 2 +// CHECK3-NEXT: store i16 [[TMP2]], i16* [[DOTADDR2]], 
align 2 +// CHECK3-NEXT: store i16 [[TMP3]], i16* [[DOTADDR3]], align 2 +// CHECK3-NEXT: [[TMP4:%.*]] = load i8*, i8** [[DOTADDR]], align 4 +// CHECK3-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to [2 x i8*]* +// CHECK3-NEXT: [[TMP6:%.*]] = load i16, i16* [[DOTADDR1]], align 2 +// CHECK3-NEXT: [[TMP7:%.*]] = load i16, i16* [[DOTADDR2]], align 2 +// CHECK3-NEXT: [[TMP8:%.*]] = load i16, i16* [[DOTADDR3]], align 2 +// CHECK3-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 0 +// CHECK3-NEXT: [[TMP10:%.*]] = load i8*, i8** [[TMP9]], align 4 +// CHECK3-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0 +// CHECK3-NEXT: [[TMP12:%.*]] = bitcast i8* [[TMP10]] to i32* +// CHECK3-NEXT: [[TMP13:%.*]] = getelementptr i32, i32* [[TMP12]], i32 1 +// CHECK3-NEXT: [[TMP14:%.*]] = bitcast i32* [[TMP13]] to i8* +// CHECK3-NEXT: [[TMP15:%.*]] = load i32, i32* [[TMP12]], align 4 +// CHECK3-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK3-NEXT: [[TMP16:%.*]] = trunc i32 [[NVPTX_WARP_SIZE]] to i16 +// CHECK3-NEXT: [[TMP17:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP15]], i16 [[TMP7]], i16 [[TMP16]]) +// CHECK3-NEXT: store i32 [[TMP17]], i32* [[DOTOMP_REDUCTION_ELEMENT]], align 4 +// CHECK3-NEXT: [[TMP18:%.*]] = getelementptr i32, i32* [[TMP12]], i32 1 +// CHECK3-NEXT: [[TMP19:%.*]] = getelementptr i32, i32* [[DOTOMP_REDUCTION_ELEMENT]], i32 1 +// CHECK3-NEXT: [[TMP20:%.*]] = bitcast i32* [[DOTOMP_REDUCTION_ELEMENT]] to i8* +// CHECK3-NEXT: store i8* [[TMP20]], i8** [[TMP11]], align 4 +// CHECK3-NEXT: [[TMP21:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 1 +// CHECK3-NEXT: [[TMP22:%.*]] = load i8*, i8** [[TMP21]], align 4 +// CHECK3-NEXT: [[TMP23:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1 +// CHECK3-NEXT: [[TMP24:%.*]] = bitcast i8* [[TMP22]] to i16* +// CHECK3-NEXT: [[TMP25:%.*]] = getelementptr i16, i16* [[TMP24]], i32 1 +// CHECK3-NEXT: [[TMP26:%.*]] = bitcast i16* [[TMP25]] to i8* +// CHECK3-NEXT: [[TMP27:%.*]] = load i16, i16* [[TMP24]], align 2 +// CHECK3-NEXT: [[TMP28:%.*]] = sext i16 [[TMP27]] to i32 +// CHECK3-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK3-NEXT: [[TMP29:%.*]] = trunc i32 [[NVPTX_WARP_SIZE5]] to i16 +// CHECK3-NEXT: [[TMP30:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP28]], i16 [[TMP7]], i16 [[TMP29]]) +// CHECK3-NEXT: [[TMP31:%.*]] = trunc i32 [[TMP30]] to i16 +// CHECK3-NEXT: store i16 [[TMP31]], i16* [[DOTOMP_REDUCTION_ELEMENT4]], align 2 +// CHECK3-NEXT: [[TMP32:%.*]] = getelementptr i16, i16* [[TMP24]], i32 1 +// CHECK3-NEXT: [[TMP33:%.*]] = getelementptr i16, i16* [[DOTOMP_REDUCTION_ELEMENT4]], i32 1 +// CHECK3-NEXT: [[TMP34:%.*]] = bitcast i16* [[DOTOMP_REDUCTION_ELEMENT4]] to i8* +// CHECK3-NEXT: store i8* [[TMP34]], i8** [[TMP23]], align 4 +// CHECK3-NEXT: [[TMP35:%.*]] = icmp eq i16 [[TMP8]], 0 +// CHECK3-NEXT: [[TMP36:%.*]] = icmp eq i16 [[TMP8]], 1 +// CHECK3-NEXT: [[TMP37:%.*]] = icmp ult i16 [[TMP6]], [[TMP7]] +// CHECK3-NEXT: [[TMP38:%.*]] = and i1 [[TMP36]], [[TMP37]] +// CHECK3-NEXT: [[TMP39:%.*]] = icmp eq i16 [[TMP8]], 2 +// CHECK3-NEXT: [[TMP40:%.*]] = and i16 [[TMP6]], 1 +// CHECK3-NEXT: [[TMP41:%.*]] = icmp eq i16 [[TMP40]], 0 +// CHECK3-NEXT: [[TMP42:%.*]] = and i1 [[TMP39]], [[TMP41]] +// CHECK3-NEXT: [[TMP43:%.*]] = icmp sgt i16 [[TMP7]], 0 +// CHECK3-NEXT: [[TMP44:%.*]] = and i1 
[[TMP42]], [[TMP43]] +// CHECK3-NEXT: [[TMP45:%.*]] = or i1 [[TMP35]], [[TMP38]] +// CHECK3-NEXT: [[TMP46:%.*]] = or i1 [[TMP45]], [[TMP44]] +// CHECK3-NEXT: br i1 [[TMP46]], label [[THEN:%.*]], label [[ELSE:%.*]] +// CHECK3: then: +// CHECK3-NEXT: [[TMP47:%.*]] = bitcast [2 x i8*]* [[TMP5]] to i8* +// CHECK3-NEXT: [[TMP48:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to i8* +// CHECK3-NEXT: call void @"_omp$reduction$reduction_func13"(i8* [[TMP47]], i8* [[TMP48]]) #[[ATTR3]] +// CHECK3-NEXT: br label [[IFCONT:%.*]] +// CHECK3: else: +// CHECK3-NEXT: br label [[IFCONT]] +// CHECK3: ifcont: +// CHECK3-NEXT: [[TMP49:%.*]] = icmp eq i16 [[TMP8]], 1 +// CHECK3-NEXT: [[TMP50:%.*]] = icmp uge i16 [[TMP6]], [[TMP7]] +// CHECK3-NEXT: [[TMP51:%.*]] = and i1 [[TMP49]], [[TMP50]] +// CHECK3-NEXT: br i1 [[TMP51]], label [[THEN6:%.*]], label [[ELSE7:%.*]] +// CHECK3: then6: +// CHECK3-NEXT: [[TMP52:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0 +// CHECK3-NEXT: [[TMP53:%.*]] = load i8*, i8** [[TMP52]], align 4 +// CHECK3-NEXT: [[TMP54:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 0 +// CHECK3-NEXT: [[TMP55:%.*]] = load i8*, i8** [[TMP54]], align 4 +// CHECK3-NEXT: [[TMP56:%.*]] = bitcast i8* [[TMP53]] to i32* +// CHECK3-NEXT: [[TMP57:%.*]] = bitcast i8* [[TMP55]] to i32* +// CHECK3-NEXT: [[TMP58:%.*]] = load i32, i32* [[TMP56]], align 4 +// CHECK3-NEXT: store i32 [[TMP58]], i32* [[TMP57]], align 4 +// CHECK3-NEXT: [[TMP59:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1 +// CHECK3-NEXT: [[TMP60:%.*]] = load i8*, i8** [[TMP59]], align 4 +// CHECK3-NEXT: [[TMP61:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 1 +// CHECK3-NEXT: [[TMP62:%.*]] = load i8*, i8** [[TMP61]], align 4 +// CHECK3-NEXT: [[TMP63:%.*]] = bitcast i8* [[TMP60]] to i16* +// CHECK3-NEXT: [[TMP64:%.*]] = bitcast i8* [[TMP62]] to i16* +// CHECK3-NEXT: [[TMP65:%.*]] = load i16, i16* [[TMP63]], align 2 +// CHECK3-NEXT: store i16 [[TMP65]], i16* [[TMP64]], align 2 +// CHECK3-NEXT: br label [[IFCONT8:%.*]] +// CHECK3: else7: +// CHECK3-NEXT: br label [[IFCONT8]] +// CHECK3: ifcont8: +// CHECK3-NEXT: ret void +// +// +// CHECK3-LABEL: define {{[^@]+}}@_omp_reduction_inter_warp_copy_func15 +// CHECK3-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { +// CHECK3-NEXT: entry: +// CHECK3-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 +// CHECK3-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) +// CHECK3-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 +// CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK3-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK3-NEXT: [[NVPTX_TID2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK3-NEXT: [[NVPTX_LANE_ID:%.*]] = and i32 [[NVPTX_TID2]], 31 +// CHECK3-NEXT: [[NVPTX_TID3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK3-NEXT: [[NVPTX_WARP_ID:%.*]] = ashr i32 [[NVPTX_TID3]], 5 +// CHECK3-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4 +// CHECK3-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]* +// CHECK3-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4:[0-9]+]], i32 [[TMP2]]) +// CHECK3-NEXT: [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0 +// CHECK3-NEXT: br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]] +// 
CHECK3: then: +// CHECK3-NEXT: [[TMP5:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0 +// CHECK3-NEXT: [[TMP6:%.*]] = load i8*, i8** [[TMP5]], align 4 +// CHECK3-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to i32* +// CHECK3-NEXT: [[TMP8:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]] +// CHECK3-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP7]], align 4 +// CHECK3-NEXT: store volatile i32 [[TMP9]], i32 addrspace(3)* [[TMP8]], align 4 +// CHECK3-NEXT: br label [[IFCONT:%.*]] +// CHECK3: else: +// CHECK3-NEXT: br label [[IFCONT]] +// CHECK3: ifcont: +// CHECK3-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]]) +// CHECK3-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTADDR1]], align 4 +// CHECK3-NEXT: [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP10]] +// CHECK3-NEXT: br i1 [[IS_ACTIVE_THREAD]], label [[THEN4:%.*]], label [[ELSE5:%.*]] +// CHECK3: then4: +// CHECK3-NEXT: [[TMP11:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]] +// CHECK3-NEXT: [[TMP12:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0 +// CHECK3-NEXT: [[TMP13:%.*]] = load i8*, i8** [[TMP12]], align 4 +// CHECK3-NEXT: [[TMP14:%.*]] = bitcast i8* [[TMP13]] to i32* +// CHECK3-NEXT: [[TMP15:%.*]] = load volatile i32, i32 addrspace(3)* [[TMP11]], align 4 +// CHECK3-NEXT: store i32 [[TMP15]], i32* [[TMP14]], align 4 +// CHECK3-NEXT: br label [[IFCONT6:%.*]] +// CHECK3: else5: +// CHECK3-NEXT: br label [[IFCONT6]] +// CHECK3: ifcont6: +// CHECK3-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]]) +// CHECK3-NEXT: [[WARP_MASTER7:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0 +// CHECK3-NEXT: br i1 [[WARP_MASTER7]], label [[THEN8:%.*]], label [[ELSE9:%.*]] +// CHECK3: then8: +// CHECK3-NEXT: [[TMP16:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1 +// CHECK3-NEXT: [[TMP17:%.*]] = load i8*, i8** [[TMP16]], align 4 +// CHECK3-NEXT: [[TMP18:%.*]] = bitcast i8* [[TMP17]] to i16* +// CHECK3-NEXT: [[TMP19:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]] +// CHECK3-NEXT: [[TMP20:%.*]] = bitcast i32 addrspace(3)* [[TMP19]] to i16 addrspace(3)* +// CHECK3-NEXT: [[TMP21:%.*]] = load i16, i16* [[TMP18]], align 2 +// CHECK3-NEXT: store volatile i16 [[TMP21]], i16 addrspace(3)* [[TMP20]], align 2 +// CHECK3-NEXT: br label [[IFCONT10:%.*]] +// CHECK3: else9: +// CHECK3-NEXT: br label [[IFCONT10]] +// CHECK3: ifcont10: +// CHECK3-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]]) +// CHECK3-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTADDR1]], align 4 +// CHECK3-NEXT: [[IS_ACTIVE_THREAD11:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP22]] +// CHECK3-NEXT: br i1 [[IS_ACTIVE_THREAD11]], label [[THEN12:%.*]], label [[ELSE13:%.*]] +// CHECK3: then12: +// CHECK3-NEXT: [[TMP23:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]] +// CHECK3-NEXT: [[TMP24:%.*]] = bitcast i32 addrspace(3)* [[TMP23]] to i16 addrspace(3)* +// CHECK3-NEXT: [[TMP25:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1 +// CHECK3-NEXT: [[TMP26:%.*]] = load i8*, i8** [[TMP25]], align 4 +// CHECK3-NEXT: [[TMP27:%.*]] = bitcast i8* [[TMP26]] to i16* +// 
CHECK3-NEXT: [[TMP28:%.*]] = load volatile i16, i16 addrspace(3)* [[TMP24]], align 2 +// CHECK3-NEXT: store i16 [[TMP28]], i16* [[TMP27]], align 2 +// CHECK3-NEXT: br label [[IFCONT14:%.*]] +// CHECK3: else13: +// CHECK3-NEXT: br label [[IFCONT14]] +// CHECK3: ifcont14: +// CHECK3-NEXT: ret void +// +// +// CHECK3-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func17 +// CHECK3-SAME: (i8* [[TMP0:%.*]], i16 signext [[TMP1:%.*]], i16 signext [[TMP2:%.*]], i16 signext [[TMP3:%.*]]) #[[ATTR0]] { +// CHECK3-NEXT: entry: +// CHECK3-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 +// CHECK3-NEXT: [[DOTADDR1:%.*]] = alloca i16, align 2 +// CHECK3-NEXT: [[DOTADDR2:%.*]] = alloca i16, align 2 +// CHECK3-NEXT: [[DOTADDR3:%.*]] = alloca i16, align 2 +// CHECK3-NEXT: [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST:%.*]] = alloca [2 x i8*], align 4 +// CHECK3-NEXT: [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTOMP_REDUCTION_ELEMENT4:%.*]] = alloca i16, align 2 +// CHECK3-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 +// CHECK3-NEXT: store i16 [[TMP1]], i16* [[DOTADDR1]], align 2 +// CHECK3-NEXT: store i16 [[TMP2]], i16* [[DOTADDR2]], align 2 +// CHECK3-NEXT: store i16 [[TMP3]], i16* [[DOTADDR3]], align 2 +// CHECK3-NEXT: [[TMP4:%.*]] = load i8*, i8** [[DOTADDR]], align 4 +// CHECK3-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to [2 x i8*]* +// CHECK3-NEXT: [[TMP6:%.*]] = load i16, i16* [[DOTADDR1]], align 2 +// CHECK3-NEXT: [[TMP7:%.*]] = load i16, i16* [[DOTADDR2]], align 2 +// CHECK3-NEXT: [[TMP8:%.*]] = load i16, i16* [[DOTADDR3]], align 2 +// CHECK3-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 0 +// CHECK3-NEXT: [[TMP10:%.*]] = load i8*, i8** [[TMP9]], align 4 +// CHECK3-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0 +// CHECK3-NEXT: [[TMP12:%.*]] = bitcast i8* [[TMP10]] to i32* +// CHECK3-NEXT: [[TMP13:%.*]] = getelementptr i32, i32* [[TMP12]], i32 1 +// CHECK3-NEXT: [[TMP14:%.*]] = bitcast i32* [[TMP13]] to i8* +// CHECK3-NEXT: [[TMP15:%.*]] = load i32, i32* [[TMP12]], align 4 +// CHECK3-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK3-NEXT: [[TMP16:%.*]] = trunc i32 [[NVPTX_WARP_SIZE]] to i16 +// CHECK3-NEXT: [[TMP17:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP15]], i16 [[TMP7]], i16 [[TMP16]]) +// CHECK3-NEXT: store i32 [[TMP17]], i32* [[DOTOMP_REDUCTION_ELEMENT]], align 4 +// CHECK3-NEXT: [[TMP18:%.*]] = getelementptr i32, i32* [[TMP12]], i32 1 +// CHECK3-NEXT: [[TMP19:%.*]] = getelementptr i32, i32* [[DOTOMP_REDUCTION_ELEMENT]], i32 1 +// CHECK3-NEXT: [[TMP20:%.*]] = bitcast i32* [[DOTOMP_REDUCTION_ELEMENT]] to i8* +// CHECK3-NEXT: store i8* [[TMP20]], i8** [[TMP11]], align 4 +// CHECK3-NEXT: [[TMP21:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 1 +// CHECK3-NEXT: [[TMP22:%.*]] = load i8*, i8** [[TMP21]], align 4 +// CHECK3-NEXT: [[TMP23:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1 +// CHECK3-NEXT: [[TMP24:%.*]] = bitcast i8* [[TMP22]] to i16* +// CHECK3-NEXT: [[TMP25:%.*]] = getelementptr i16, i16* [[TMP24]], i32 1 +// CHECK3-NEXT: [[TMP26:%.*]] = bitcast i16* [[TMP25]] to i8* +// CHECK3-NEXT: [[TMP27:%.*]] = load i16, i16* [[TMP24]], align 2 +// CHECK3-NEXT: [[TMP28:%.*]] = sext i16 [[TMP27]] to i32 +// CHECK3-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK3-NEXT: [[TMP29:%.*]] 
= trunc i32 [[NVPTX_WARP_SIZE5]] to i16 +// CHECK3-NEXT: [[TMP30:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP28]], i16 [[TMP7]], i16 [[TMP29]]) +// CHECK3-NEXT: [[TMP31:%.*]] = trunc i32 [[TMP30]] to i16 +// CHECK3-NEXT: store i16 [[TMP31]], i16* [[DOTOMP_REDUCTION_ELEMENT4]], align 2 +// CHECK3-NEXT: [[TMP32:%.*]] = getelementptr i16, i16* [[TMP24]], i32 1 +// CHECK3-NEXT: [[TMP33:%.*]] = getelementptr i16, i16* [[DOTOMP_REDUCTION_ELEMENT4]], i32 1 +// CHECK3-NEXT: [[TMP34:%.*]] = bitcast i16* [[DOTOMP_REDUCTION_ELEMENT4]] to i8* +// CHECK3-NEXT: store i8* [[TMP34]], i8** [[TMP23]], align 4 +// CHECK3-NEXT: [[TMP35:%.*]] = icmp eq i16 [[TMP8]], 0 +// CHECK3-NEXT: [[TMP36:%.*]] = icmp eq i16 [[TMP8]], 1 +// CHECK3-NEXT: [[TMP37:%.*]] = icmp ult i16 [[TMP6]], [[TMP7]] +// CHECK3-NEXT: [[TMP38:%.*]] = and i1 [[TMP36]], [[TMP37]] +// CHECK3-NEXT: [[TMP39:%.*]] = icmp eq i16 [[TMP8]], 2 +// CHECK3-NEXT: [[TMP40:%.*]] = and i16 [[TMP6]], 1 +// CHECK3-NEXT: [[TMP41:%.*]] = icmp eq i16 [[TMP40]], 0 +// CHECK3-NEXT: [[TMP42:%.*]] = and i1 [[TMP39]], [[TMP41]] +// CHECK3-NEXT: [[TMP43:%.*]] = icmp sgt i16 [[TMP7]], 0 +// CHECK3-NEXT: [[TMP44:%.*]] = and i1 [[TMP42]], [[TMP43]] +// CHECK3-NEXT: [[TMP45:%.*]] = or i1 [[TMP35]], [[TMP38]] +// CHECK3-NEXT: [[TMP46:%.*]] = or i1 [[TMP45]], [[TMP44]] +// CHECK3-NEXT: br i1 [[TMP46]], label [[THEN:%.*]], label [[ELSE:%.*]] +// CHECK3: then: +// CHECK3-NEXT: [[TMP47:%.*]] = bitcast [2 x i8*]* [[TMP5]] to i8* +// CHECK3-NEXT: [[TMP48:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to i8* +// CHECK3-NEXT: call void @"_omp$reduction$reduction_func16"(i8* [[TMP47]], i8* [[TMP48]]) #[[ATTR3]] +// CHECK3-NEXT: br label [[IFCONT:%.*]] +// CHECK3: else: +// CHECK3-NEXT: br label [[IFCONT]] +// CHECK3: ifcont: +// CHECK3-NEXT: [[TMP49:%.*]] = icmp eq i16 [[TMP8]], 1 +// CHECK3-NEXT: [[TMP50:%.*]] = icmp uge i16 [[TMP6]], [[TMP7]] +// CHECK3-NEXT: [[TMP51:%.*]] = and i1 [[TMP49]], [[TMP50]] +// CHECK3-NEXT: br i1 [[TMP51]], label [[THEN6:%.*]], label [[ELSE7:%.*]] +// CHECK3: then6: +// CHECK3-NEXT: [[TMP52:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0 +// CHECK3-NEXT: [[TMP53:%.*]] = load i8*, i8** [[TMP52]], align 4 +// CHECK3-NEXT: [[TMP54:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 0 +// CHECK3-NEXT: [[TMP55:%.*]] = load i8*, i8** [[TMP54]], align 4 +// CHECK3-NEXT: [[TMP56:%.*]] = bitcast i8* [[TMP53]] to i32* +// CHECK3-NEXT: [[TMP57:%.*]] = bitcast i8* [[TMP55]] to i32* +// CHECK3-NEXT: [[TMP58:%.*]] = load i32, i32* [[TMP56]], align 4 +// CHECK3-NEXT: store i32 [[TMP58]], i32* [[TMP57]], align 4 +// CHECK3-NEXT: [[TMP59:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1 +// CHECK3-NEXT: [[TMP60:%.*]] = load i8*, i8** [[TMP59]], align 4 +// CHECK3-NEXT: [[TMP61:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 1 +// CHECK3-NEXT: [[TMP62:%.*]] = load i8*, i8** [[TMP61]], align 4 +// CHECK3-NEXT: [[TMP63:%.*]] = bitcast i8* [[TMP60]] to i16* +// CHECK3-NEXT: [[TMP64:%.*]] = bitcast i8* [[TMP62]] to i16* +// CHECK3-NEXT: [[TMP65:%.*]] = load i16, i16* [[TMP63]], align 2 +// CHECK3-NEXT: store i16 [[TMP65]], i16* [[TMP64]], align 2 +// CHECK3-NEXT: br label [[IFCONT8:%.*]] +// CHECK3: else7: +// CHECK3-NEXT: br label [[IFCONT8]] +// CHECK3: ifcont8: +// CHECK3-NEXT: ret void +// +// +// CHECK3-LABEL: define {{[^@]+}}@_omp_reduction_inter_warp_copy_func18 +// CHECK3-SAME: 
(i8* [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { +// CHECK3-NEXT: entry: +// CHECK3-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 +// CHECK3-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) +// CHECK3-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 +// CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK3-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK3-NEXT: [[NVPTX_TID2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK3-NEXT: [[NVPTX_LANE_ID:%.*]] = and i32 [[NVPTX_TID2]], 31 +// CHECK3-NEXT: [[NVPTX_TID3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK3-NEXT: [[NVPTX_WARP_ID:%.*]] = ashr i32 [[NVPTX_TID3]], 5 +// CHECK3-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4 +// CHECK3-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]* +// CHECK3-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]]) +// CHECK3-NEXT: [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0 +// CHECK3-NEXT: br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]] +// CHECK3: then: +// CHECK3-NEXT: [[TMP5:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0 +// CHECK3-NEXT: [[TMP6:%.*]] = load i8*, i8** [[TMP5]], align 4 +// CHECK3-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to i32* +// CHECK3-NEXT: [[TMP8:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]] +// CHECK3-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP7]], align 4 +// CHECK3-NEXT: store volatile i32 [[TMP9]], i32 addrspace(3)* [[TMP8]], align 4 +// CHECK3-NEXT: br label [[IFCONT:%.*]] +// CHECK3: else: +// CHECK3-NEXT: br label [[IFCONT]] +// CHECK3: ifcont: +// CHECK3-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]]) +// CHECK3-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTADDR1]], align 4 +// CHECK3-NEXT: [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP10]] +// CHECK3-NEXT: br i1 [[IS_ACTIVE_THREAD]], label [[THEN4:%.*]], label [[ELSE5:%.*]] +// CHECK3: then4: +// CHECK3-NEXT: [[TMP11:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]] +// CHECK3-NEXT: [[TMP12:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0 +// CHECK3-NEXT: [[TMP13:%.*]] = load i8*, i8** [[TMP12]], align 4 +// CHECK3-NEXT: [[TMP14:%.*]] = bitcast i8* [[TMP13]] to i32* +// CHECK3-NEXT: [[TMP15:%.*]] = load volatile i32, i32 addrspace(3)* [[TMP11]], align 4 +// CHECK3-NEXT: store i32 [[TMP15]], i32* [[TMP14]], align 4 +// CHECK3-NEXT: br label [[IFCONT6:%.*]] +// CHECK3: else5: +// CHECK3-NEXT: br label [[IFCONT6]] +// CHECK3: ifcont6: +// CHECK3-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]]) +// CHECK3-NEXT: [[WARP_MASTER7:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0 +// CHECK3-NEXT: br i1 [[WARP_MASTER7]], label [[THEN8:%.*]], label [[ELSE9:%.*]] +// CHECK3: then8: +// CHECK3-NEXT: [[TMP16:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1 +// CHECK3-NEXT: [[TMP17:%.*]] = load i8*, i8** [[TMP16]], align 4 +// CHECK3-NEXT: [[TMP18:%.*]] = bitcast i8* [[TMP17]] to i16* +// CHECK3-NEXT: [[TMP19:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]] +// CHECK3-NEXT: [[TMP20:%.*]] = bitcast i32 
addrspace(3)* [[TMP19]] to i16 addrspace(3)* +// CHECK3-NEXT: [[TMP21:%.*]] = load i16, i16* [[TMP18]], align 2 +// CHECK3-NEXT: store volatile i16 [[TMP21]], i16 addrspace(3)* [[TMP20]], align 2 +// CHECK3-NEXT: br label [[IFCONT10:%.*]] +// CHECK3: else9: +// CHECK3-NEXT: br label [[IFCONT10]] +// CHECK3: ifcont10: +// CHECK3-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]]) +// CHECK3-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTADDR1]], align 4 +// CHECK3-NEXT: [[IS_ACTIVE_THREAD11:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP22]] +// CHECK3-NEXT: br i1 [[IS_ACTIVE_THREAD11]], label [[THEN12:%.*]], label [[ELSE13:%.*]] +// CHECK3: then12: +// CHECK3-NEXT: [[TMP23:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]] +// CHECK3-NEXT: [[TMP24:%.*]] = bitcast i32 addrspace(3)* [[TMP23]] to i16 addrspace(3)* +// CHECK3-NEXT: [[TMP25:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1 +// CHECK3-NEXT: [[TMP26:%.*]] = load i8*, i8** [[TMP25]], align 4 +// CHECK3-NEXT: [[TMP27:%.*]] = bitcast i8* [[TMP26]] to i16* +// CHECK3-NEXT: [[TMP28:%.*]] = load volatile i16, i16 addrspace(3)* [[TMP24]], align 2 +// CHECK3-NEXT: store i16 [[TMP28]], i16* [[TMP27]], align 2 +// CHECK3-NEXT: br label [[IFCONT14:%.*]] +// CHECK3: else13: +// CHECK3-NEXT: br label [[IFCONT14]] +// CHECK3: ifcont14: +// CHECK3-NEXT: ret void +// +// +// CHECK3-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_copy_func19 +// CHECK3-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { +// CHECK3-NEXT: entry: +// CHECK3-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 +// CHECK3-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4 +// CHECK3-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 +// CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK3-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4 +// CHECK3-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 4 +// CHECK3-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]* +// CHECK3-NEXT: [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 4 +// CHECK3-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.4* +// CHECK3-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4 +// CHECK3-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0 +// CHECK3-NEXT: [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 4 +// CHECK3-NEXT: [[TMP10:%.*]] = bitcast i8* [[TMP9]] to i32* +// CHECK3-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4:%.*]], %struct._globalized_locals_ty.4* [[TMP6]], i32 0, i32 0 +// CHECK3-NEXT: [[TMP11:%.*]] = getelementptr inbounds [1024 x i32], [1024 x i32]* [[A]], i32 0, i32 [[TMP7]] +// CHECK3-NEXT: [[TMP12:%.*]] = load i32, i32* [[TMP10]], align 4 +// CHECK3-NEXT: store i32 [[TMP12]], i32* [[TMP11]], align 128 +// CHECK3-NEXT: [[TMP13:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1 +// CHECK3-NEXT: [[TMP14:%.*]] = load i8*, i8** [[TMP13]], align 4 +// CHECK3-NEXT: [[TMP15:%.*]] = bitcast i8* [[TMP14]] to i16* +// CHECK3-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4]], %struct._globalized_locals_ty.4* [[TMP6]], i32 0, i32 1 +// CHECK3-NEXT: [[TMP16:%.*]] = getelementptr inbounds [1024 x i16], [1024 x i16]* [[B]], i32 0, i32 [[TMP7]] +// CHECK3-NEXT: [[TMP17:%.*]] = load i16, i16* [[TMP15]], align 2 +// 
CHECK3-NEXT: store i16 [[TMP17]], i16* [[TMP16]], align 128 +// CHECK3-NEXT: ret void +// +// +// CHECK3-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_reduce_func20 +// CHECK3-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { +// CHECK3-NEXT: entry: +// CHECK3-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 +// CHECK3-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4 +// CHECK3-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 4 +// CHECK3-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 +// CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK3-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4 +// CHECK3-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4 +// CHECK3-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.4* +// CHECK3-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4 +// CHECK3-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0 +// CHECK3-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4:%.*]], %struct._globalized_locals_ty.4* [[TMP4]], i32 0, i32 0 +// CHECK3-NEXT: [[TMP7:%.*]] = getelementptr inbounds [1024 x i32], [1024 x i32]* [[A]], i32 0, i32 [[TMP5]] +// CHECK3-NEXT: [[TMP8:%.*]] = bitcast i32* [[TMP7]] to i8* +// CHECK3-NEXT: store i8* [[TMP8]], i8** [[TMP6]], align 4 +// CHECK3-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1 +// CHECK3-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4]], %struct._globalized_locals_ty.4* [[TMP4]], i32 0, i32 1 +// CHECK3-NEXT: [[TMP10:%.*]] = getelementptr inbounds [1024 x i16], [1024 x i16]* [[B]], i32 0, i32 [[TMP5]] +// CHECK3-NEXT: [[TMP11:%.*]] = bitcast i16* [[TMP10]] to i8* +// CHECK3-NEXT: store i8* [[TMP11]], i8** [[TMP9]], align 4 +// CHECK3-NEXT: [[TMP12:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8* +// CHECK3-NEXT: [[TMP13:%.*]] = load i8*, i8** [[DOTADDR2]], align 4 +// CHECK3-NEXT: call void @"_omp$reduction$reduction_func16"(i8* [[TMP12]], i8* [[TMP13]]) #[[ATTR3]] +// CHECK3-NEXT: ret void +// +// +// CHECK3-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_copy_func21 +// CHECK3-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { +// CHECK3-NEXT: entry: +// CHECK3-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 +// CHECK3-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4 +// CHECK3-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 +// CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK3-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4 +// CHECK3-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 4 +// CHECK3-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]* +// CHECK3-NEXT: [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 4 +// CHECK3-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.4* +// CHECK3-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4 +// CHECK3-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0 +// CHECK3-NEXT: [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 4 +// CHECK3-NEXT: [[TMP10:%.*]] = bitcast i8* [[TMP9]] to i32* +// CHECK3-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4:%.*]], %struct._globalized_locals_ty.4* [[TMP6]], i32 0, i32 0 +// CHECK3-NEXT: [[TMP11:%.*]] = 
getelementptr inbounds [1024 x i32], [1024 x i32]* [[A]], i32 0, i32 [[TMP7]]
+// CHECK3-NEXT: [[TMP12:%.*]] = load i32, i32* [[TMP11]], align 128
+// CHECK3-NEXT: store i32 [[TMP12]], i32* [[TMP10]], align 4
+// CHECK3-NEXT: [[TMP13:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1
+// CHECK3-NEXT: [[TMP14:%.*]] = load i8*, i8** [[TMP13]], align 4
+// CHECK3-NEXT: [[TMP15:%.*]] = bitcast i8* [[TMP14]] to i16*
+// CHECK3-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4]], %struct._globalized_locals_ty.4* [[TMP6]], i32 0, i32 1
+// CHECK3-NEXT: [[TMP16:%.*]] = getelementptr inbounds [1024 x i16], [1024 x i16]* [[B]], i32 0, i32 [[TMP7]]
+// CHECK3-NEXT: [[TMP17:%.*]] = load i16, i16* [[TMP16]], align 128
+// CHECK3-NEXT: store i16 [[TMP17]], i16* [[TMP15]], align 2
+// CHECK3-NEXT: ret void
+//
+//
+// CHECK3-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_reduce_func22
+// CHECK3-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] {
+// CHECK3-NEXT: entry:
+// CHECK3-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
+// CHECK3-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4
+// CHECK3-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 4
+// CHECK3-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
+// CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
+// CHECK3-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4
+// CHECK3-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4
+// CHECK3-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.4*
+// CHECK3-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4
+// CHECK3-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0
+// CHECK3-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4:%.*]], %struct._globalized_locals_ty.4* [[TMP4]], i32 0, i32 0
+// CHECK3-NEXT: [[TMP7:%.*]] = getelementptr inbounds [1024 x i32], [1024 x i32]* [[A]], i32 0, i32 [[TMP5]]
+// CHECK3-NEXT: [[TMP8:%.*]] = bitcast i32* [[TMP7]] to i8*
+// CHECK3-NEXT: store i8* [[TMP8]], i8** [[TMP6]], align 4
+// CHECK3-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1
+// CHECK3-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4]], %struct._globalized_locals_ty.4* [[TMP4]], i32 0, i32 1
+// CHECK3-NEXT: [[TMP10:%.*]] = getelementptr inbounds [1024 x i16], [1024 x i16]* [[B]], i32 0, i32 [[TMP5]]
+// CHECK3-NEXT: [[TMP11:%.*]] = bitcast i16* [[TMP10]] to i8*
+// CHECK3-NEXT: store i8* [[TMP11]], i8** [[TMP9]], align 4
+// CHECK3-NEXT: [[TMP12:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
+// CHECK3-NEXT: [[TMP13:%.*]] = load i8*, i8** [[DOTADDR2]], align 4
+// CHECK3-NEXT: call void @"_omp$reduction$reduction_func16"(i8* [[TMP13]], i8* [[TMP12]]) #[[ATTR3]]
+// CHECK3-NEXT: ret void
+//
+//
+// CHECK4-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l23_worker
+// CHECK4-SAME: () #[[ATTR0:[0-9]+]] {
+// CHECK4-NEXT: entry:
+// CHECK4-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 4
+// CHECK4-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1
+// CHECK4-NEXT: store i8* null, i8** [[WORK_FN]], align 4
+// CHECK4-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1
+// CHECK4-NEXT: br label [[DOTAWAIT_WORK:%.*]]
+// CHECK4: .await.work:
+// CHECK4-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
+// CHECK4-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]])
+// CHECK4-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8
+// CHECK4-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1
+// CHECK4-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 4
+// CHECK4-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null
+// CHECK4-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]]
+// CHECK4: .select.workers:
+// CHECK4-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1
+// CHECK4-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0
+// CHECK4-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]]
+// CHECK4: .execute.parallel:
+// CHECK4-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]])
+// CHECK4-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)*
+// CHECK4-NEXT: call void [[TMP5]](i16 0, i32 [[TMP4]])
+// CHECK4-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]]
+// CHECK4: .terminate.parallel:
+// CHECK4-NEXT: call void @__kmpc_kernel_end_parallel()
+// CHECK4-NEXT: br label [[DOTBARRIER_PARALLEL]]
+// CHECK4: .barrier.parallel:
+// CHECK4-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
+// CHECK4-NEXT: br label [[DOTAWAIT_WORK]]
+// CHECK4: .exit:
+// CHECK4-NEXT: ret void
+//
+//
+// CHECK4-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l23
+// CHECK4-SAME: (double* nonnull align 8 dereferenceable(8) [[E:%.*]]) #[[ATTR1:[0-9]+]] {
+// CHECK4-NEXT: entry:
+// CHECK4-NEXT: [[E_ADDR:%.*]] = alloca double*, align 4
+// CHECK4-NEXT: [[E7:%.*]] = alloca double, align 8
+// CHECK4-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK4-NEXT: store double* [[E]], double** [[E_ADDR]], align 4
+// CHECK4-NEXT: [[TMP0:%.*]] = load double*, double** [[E_ADDR]], align 4
+// CHECK4-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+// CHECK4-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK4-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+// CHECK4-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]]
+// CHECK4-NEXT: [[TMP1:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]]
+// CHECK4-NEXT: br i1 [[TMP1]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]]
+// CHECK4: .worker:
+// CHECK4-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l23_worker() #[[ATTR3:[0-9]+]]
+// CHECK4-NEXT: br label [[DOTEXIT:%.*]]
+// CHECK4: .mastercheck:
+// CHECK4-NEXT: [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+// CHECK4-NEXT: [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK4-NEXT: [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+// CHECK4-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1
+// CHECK4-NEXT: [[TMP3:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1
+// CHECK4-NEXT: [[TMP4:%.*]] = xor i32 [[TMP2]], -1
+// CHECK4-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP3]], [[TMP4]]
+// CHECK4-NEXT: [[TMP5:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]]
+// CHECK4-NEXT: br i1 [[TMP5]], label [[DOTMASTER:%.*]], label [[DOTEXIT]]
+// CHECK4: .master:
+// CHECK4-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK4-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+// CHECK4-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]]
+// CHECK4-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1)
+// CHECK4-NEXT: call void @__kmpc_data_sharing_init_stack()
+// CHECK4-NEXT: [[TMP6:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK4-NEXT: [[TMP7:%.*]] = load double, double* [[TMP0]], align 8
+// CHECK4-NEXT: store double [[TMP7]], double* [[E7]], align 8
+// CHECK4-NEXT: store i32 [[TMP6]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK4-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], double* [[E7]]) #[[ATTR3]]
+// CHECK4-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]]
+// CHECK4: .termination.notifier:
+// CHECK4-NEXT: call void @__kmpc_kernel_deinit(i16 1)
+// CHECK4-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
+// CHECK4-NEXT: br label [[DOTEXIT]]
+// CHECK4: .exit:
+// CHECK4-NEXT: ret void
+//
+//
+// CHECK4-LABEL: define {{[^@]+}}@__omp_outlined__
+// CHECK4-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], double* nonnull align 8 dereferenceable(8) [[E:%.*]]) #[[ATTR1]] {
+// CHECK4-NEXT: entry:
+// CHECK4-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK4-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK4-NEXT: [[E_ADDR:%.*]] = alloca double*, align 4
+// CHECK4-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x i8*], align 4
+// CHECK4-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK4-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK4-NEXT: store double* [[E]], double** [[E_ADDR]], align 4
+// CHECK4-NEXT: [[TMP0:%.*]] = load double*, double** [[E_ADDR]], align 4
+// CHECK4-NEXT: [[TMP1:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared", align 2
+// CHECK4-NEXT: [[TMP2:%.*]] = load i32, i32* @"_openmp_static_kernel$size", align 4
+// CHECK4-NEXT: call void @__kmpc_get_team_static_memory(i16 0, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds (%"union._shared_openmp_static_memory_type_$_", %"union._shared_openmp_static_memory_type_$_" addrspace(3)* @"_openmp_shared_static_glob_rd_$_", i32 0, i32 0, i32 0) to i8*), i32 [[TMP2]], i16 [[TMP1]], i8** addrspacecast (i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr" to i8**))
+// CHECK4-NEXT: [[TMP3:%.*]] = load i8*, i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr", align 4
+// CHECK4-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, i8* [[TMP3]], i32 0
+// CHECK4-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to %struct._globalized_locals_ty*
+// CHECK4-NEXT: [[E1:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP5]], i32 0, i32 0
+// CHECK4-NEXT: store double 0.000000e+00, double* [[E1]], align 8
+// CHECK4-NEXT: [[TMP6:%.*]] = load double, double* [[E1]], align 8
+// CHECK4-NEXT: [[ADD:%.*]] = fadd double [[TMP6]], 5.000000e+00
+// CHECK4-NEXT: store double [[ADD]], double* [[E1]], align 8
+// CHECK4-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK4-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4
+// CHECK4-NEXT: [[TMP9:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0
+// CHECK4-NEXT: [[TMP10:%.*]] = bitcast double* [[E1]] to i8*
+// CHECK4-NEXT: store i8* [[TMP10]], i8** [[TMP9]], align
4 +// CHECK4-NEXT: [[TMP11:%.*]] = bitcast [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8* +// CHECK4-NEXT: [[TMP12:%.*]] = load i8*, i8** @"_openmp_teams_reductions_buffer_$_$ptr", align 4 +// CHECK4-NEXT: [[TMP13:%.*]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(%struct.ident_t* @[[GLOB1]], i32 [[TMP8]], i8* [[TMP12]], i32 2048, i8* [[TMP11]], void (i8*, i16, i16, i16)* @_omp_reduction_shuffle_and_reduce_func, void (i8*, i32)* @_omp_reduction_inter_warp_copy_func, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_copy_func, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_reduce_func, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_copy_func, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_reduce_func) +// CHECK4-NEXT: [[TMP14:%.*]] = icmp eq i32 [[TMP13]], 1 +// CHECK4-NEXT: br i1 [[TMP14]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]] +// CHECK4: .omp.reduction.then: +// CHECK4-NEXT: [[TMP15:%.*]] = load double, double* [[TMP0]], align 8 +// CHECK4-NEXT: [[TMP16:%.*]] = load double, double* [[E1]], align 8 +// CHECK4-NEXT: [[ADD2:%.*]] = fadd double [[TMP15]], [[TMP16]] +// CHECK4-NEXT: store double [[ADD2]], double* [[TMP0]], align 8 +// CHECK4-NEXT: call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP8]]) +// CHECK4-NEXT: br label [[DOTOMP_REDUCTION_DONE]] +// CHECK4: .omp.reduction.done: +// CHECK4-NEXT: [[TMP17:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared", align 2 +// CHECK4-NEXT: call void @__kmpc_restore_team_static_memory(i16 0, i16 [[TMP17]]) +// CHECK4-NEXT: ret void +// +// +// CHECK4-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func +// CHECK4-SAME: (i8* [[TMP0:%.*]], i16 signext [[TMP1:%.*]], i16 signext [[TMP2:%.*]], i16 signext [[TMP3:%.*]]) #[[ATTR0]] { +// CHECK4-NEXT: entry: +// CHECK4-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 +// CHECK4-NEXT: [[DOTADDR1:%.*]] = alloca i16, align 2 +// CHECK4-NEXT: [[DOTADDR2:%.*]] = alloca i16, align 2 +// CHECK4-NEXT: [[DOTADDR3:%.*]] = alloca i16, align 2 +// CHECK4-NEXT: [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST:%.*]] = alloca [1 x i8*], align 4 +// CHECK4-NEXT: [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca double, align 8 +// CHECK4-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 +// CHECK4-NEXT: store i16 [[TMP1]], i16* [[DOTADDR1]], align 2 +// CHECK4-NEXT: store i16 [[TMP2]], i16* [[DOTADDR2]], align 2 +// CHECK4-NEXT: store i16 [[TMP3]], i16* [[DOTADDR3]], align 2 +// CHECK4-NEXT: [[TMP4:%.*]] = load i8*, i8** [[DOTADDR]], align 4 +// CHECK4-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to [1 x i8*]* +// CHECK4-NEXT: [[TMP6:%.*]] = load i16, i16* [[DOTADDR1]], align 2 +// CHECK4-NEXT: [[TMP7:%.*]] = load i16, i16* [[DOTADDR2]], align 2 +// CHECK4-NEXT: [[TMP8:%.*]] = load i16, i16* [[DOTADDR3]], align 2 +// CHECK4-NEXT: [[TMP9:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP5]], i32 0, i32 0 +// CHECK4-NEXT: [[TMP10:%.*]] = load i8*, i8** [[TMP9]], align 4 +// CHECK4-NEXT: [[TMP11:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0 +// CHECK4-NEXT: [[TMP12:%.*]] = bitcast i8* [[TMP10]] to double* +// CHECK4-NEXT: [[TMP13:%.*]] = getelementptr double, double* [[TMP12]], i32 1 +// CHECK4-NEXT: [[TMP14:%.*]] = bitcast double* [[TMP13]] to i8* +// CHECK4-NEXT: [[TMP15:%.*]] = bitcast double* [[TMP12]] to i64* +// CHECK4-NEXT: [[TMP16:%.*]] = bitcast double* [[DOTOMP_REDUCTION_ELEMENT]] to i64* +// CHECK4-NEXT: [[TMP17:%.*]] = load i64, i64* [[TMP15]], align 8 +// CHECK4-NEXT: [[NVPTX_WARP_SIZE:%.*]] 
= call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK4-NEXT: [[TMP18:%.*]] = trunc i32 [[NVPTX_WARP_SIZE]] to i16 +// CHECK4-NEXT: [[TMP19:%.*]] = call i64 @__kmpc_shuffle_int64(i64 [[TMP17]], i16 [[TMP7]], i16 [[TMP18]]) +// CHECK4-NEXT: store i64 [[TMP19]], i64* [[TMP16]], align 8 +// CHECK4-NEXT: [[TMP20:%.*]] = getelementptr i64, i64* [[TMP15]], i32 1 +// CHECK4-NEXT: [[TMP21:%.*]] = getelementptr i64, i64* [[TMP16]], i32 1 +// CHECK4-NEXT: [[TMP22:%.*]] = bitcast double* [[DOTOMP_REDUCTION_ELEMENT]] to i8* +// CHECK4-NEXT: store i8* [[TMP22]], i8** [[TMP11]], align 4 +// CHECK4-NEXT: [[TMP23:%.*]] = icmp eq i16 [[TMP8]], 0 +// CHECK4-NEXT: [[TMP24:%.*]] = icmp eq i16 [[TMP8]], 1 +// CHECK4-NEXT: [[TMP25:%.*]] = icmp ult i16 [[TMP6]], [[TMP7]] +// CHECK4-NEXT: [[TMP26:%.*]] = and i1 [[TMP24]], [[TMP25]] +// CHECK4-NEXT: [[TMP27:%.*]] = icmp eq i16 [[TMP8]], 2 +// CHECK4-NEXT: [[TMP28:%.*]] = and i16 [[TMP6]], 1 +// CHECK4-NEXT: [[TMP29:%.*]] = icmp eq i16 [[TMP28]], 0 +// CHECK4-NEXT: [[TMP30:%.*]] = and i1 [[TMP27]], [[TMP29]] +// CHECK4-NEXT: [[TMP31:%.*]] = icmp sgt i16 [[TMP7]], 0 +// CHECK4-NEXT: [[TMP32:%.*]] = and i1 [[TMP30]], [[TMP31]] +// CHECK4-NEXT: [[TMP33:%.*]] = or i1 [[TMP23]], [[TMP26]] +// CHECK4-NEXT: [[TMP34:%.*]] = or i1 [[TMP33]], [[TMP32]] +// CHECK4-NEXT: br i1 [[TMP34]], label [[THEN:%.*]], label [[ELSE:%.*]] +// CHECK4: then: +// CHECK4-NEXT: [[TMP35:%.*]] = bitcast [1 x i8*]* [[TMP5]] to i8* +// CHECK4-NEXT: [[TMP36:%.*]] = bitcast [1 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to i8* +// CHECK4-NEXT: call void @"_omp$reduction$reduction_func"(i8* [[TMP35]], i8* [[TMP36]]) #[[ATTR3]] +// CHECK4-NEXT: br label [[IFCONT:%.*]] +// CHECK4: else: +// CHECK4-NEXT: br label [[IFCONT]] +// CHECK4: ifcont: +// CHECK4-NEXT: [[TMP37:%.*]] = icmp eq i16 [[TMP8]], 1 +// CHECK4-NEXT: [[TMP38:%.*]] = icmp uge i16 [[TMP6]], [[TMP7]] +// CHECK4-NEXT: [[TMP39:%.*]] = and i1 [[TMP37]], [[TMP38]] +// CHECK4-NEXT: br i1 [[TMP39]], label [[THEN4:%.*]], label [[ELSE5:%.*]] +// CHECK4: then4: +// CHECK4-NEXT: [[TMP40:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0 +// CHECK4-NEXT: [[TMP41:%.*]] = load i8*, i8** [[TMP40]], align 4 +// CHECK4-NEXT: [[TMP42:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP5]], i32 0, i32 0 +// CHECK4-NEXT: [[TMP43:%.*]] = load i8*, i8** [[TMP42]], align 4 +// CHECK4-NEXT: [[TMP44:%.*]] = bitcast i8* [[TMP41]] to double* +// CHECK4-NEXT: [[TMP45:%.*]] = bitcast i8* [[TMP43]] to double* +// CHECK4-NEXT: [[TMP46:%.*]] = load double, double* [[TMP44]], align 8 +// CHECK4-NEXT: store double [[TMP46]], double* [[TMP45]], align 8 +// CHECK4-NEXT: br label [[IFCONT6:%.*]] +// CHECK4: else5: +// CHECK4-NEXT: br label [[IFCONT6]] +// CHECK4: ifcont6: +// CHECK4-NEXT: ret void +// +// +// CHECK4-LABEL: define {{[^@]+}}@_omp_reduction_inter_warp_copy_func +// CHECK4-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { +// CHECK4-NEXT: entry: +// CHECK4-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 +// CHECK4-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[DOTCNT_ADDR:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) +// CHECK4-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 +// CHECK4-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK4-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK4-NEXT: [[NVPTX_TID2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// 
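Note: the inter-warp copy assumes 32-lane warps, hence lane_id = tid & 31 and
+// warp_id = tid >> 5 below. Lane 0 of each warp publishes one 32-bit chunk of
+// its partial result into the shared
+// __openmp_nvptx_data_transfer_temporary_storage slot for its warp, and after
+// a barrier the first N threads (N being the warp-count argument) read the
+// chunks back; the precond loop bounded by 2 walks the two i32 halves of the
+// double. +//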
CHECK4-NEXT: [[NVPTX_LANE_ID:%.*]] = and i32 [[NVPTX_TID2]], 31 +// CHECK4-NEXT: [[NVPTX_TID3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK4-NEXT: [[NVPTX_WARP_ID:%.*]] = ashr i32 [[NVPTX_TID3]], 5 +// CHECK4-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4 +// CHECK4-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [1 x i8*]* +// CHECK4-NEXT: store i32 0, i32* [[DOTCNT_ADDR]], align 4 +// CHECK4-NEXT: br label [[PRECOND:%.*]] +// CHECK4: precond: +// CHECK4-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTCNT_ADDR]], align 4 +// CHECK4-NEXT: [[TMP6:%.*]] = icmp ult i32 [[TMP5]], 2 +// CHECK4-NEXT: br i1 [[TMP6]], label [[BODY:%.*]], label [[EXIT:%.*]] +// CHECK4: body: +// CHECK4-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP2]]) +// CHECK4-NEXT: [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0 +// CHECK4-NEXT: br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]] +// CHECK4: then: +// CHECK4-NEXT: [[TMP7:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP4]], i32 0, i32 0 +// CHECK4-NEXT: [[TMP8:%.*]] = load i8*, i8** [[TMP7]], align 4 +// CHECK4-NEXT: [[TMP9:%.*]] = bitcast i8* [[TMP8]] to i32* +// CHECK4-NEXT: [[TMP10:%.*]] = getelementptr i32, i32* [[TMP9]], i32 [[TMP5]] +// CHECK4-NEXT: [[TMP11:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]] +// CHECK4-NEXT: [[TMP12:%.*]] = load i32, i32* [[TMP10]], align 4 +// CHECK4-NEXT: store volatile i32 [[TMP12]], i32 addrspace(3)* [[TMP11]], align 4 +// CHECK4-NEXT: br label [[IFCONT:%.*]] +// CHECK4: else: +// CHECK4-NEXT: br label [[IFCONT]] +// CHECK4: ifcont: +// CHECK4-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]]) +// CHECK4-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTADDR1]], align 4 +// CHECK4-NEXT: [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP13]] +// CHECK4-NEXT: br i1 [[IS_ACTIVE_THREAD]], label [[THEN4:%.*]], label [[ELSE5:%.*]] +// CHECK4: then4: +// CHECK4-NEXT: [[TMP14:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]] +// CHECK4-NEXT: [[TMP15:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP4]], i32 0, i32 0 +// CHECK4-NEXT: [[TMP16:%.*]] = load i8*, i8** [[TMP15]], align 4 +// CHECK4-NEXT: [[TMP17:%.*]] = bitcast i8* [[TMP16]] to i32* +// CHECK4-NEXT: [[TMP18:%.*]] = getelementptr i32, i32* [[TMP17]], i32 [[TMP5]] +// CHECK4-NEXT: [[TMP19:%.*]] = load volatile i32, i32 addrspace(3)* [[TMP14]], align 4 +// CHECK4-NEXT: store i32 [[TMP19]], i32* [[TMP18]], align 4 +// CHECK4-NEXT: br label [[IFCONT6:%.*]] +// CHECK4: else5: +// CHECK4-NEXT: br label [[IFCONT6]] +// CHECK4: ifcont6: +// CHECK4-NEXT: [[TMP20:%.*]] = add nsw i32 [[TMP5]], 1 +// CHECK4-NEXT: store i32 [[TMP20]], i32* [[DOTCNT_ADDR]], align 4 +// CHECK4-NEXT: br label [[PRECOND]] +// CHECK4: exit: +// CHECK4-NEXT: ret void +// +// +// CHECK4-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_copy_func +// CHECK4-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { +// CHECK4-NEXT: entry: +// CHECK4-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 +// CHECK4-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4 +// CHECK4-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 +// CHECK4-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK4-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4 +// 
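Note: for the list_to_global helpers the first argument is the global
+// team-reduction buffer, the second this team's slot index, and the third the
+// local reduce list; the code below copies the partial E out of the list into
+// the buffer's [2048 x double] field. The buffer appears to be laid out one
+// array per reduction variable, which is presumably where the align 128 on
+// the buffer-side access comes from. +//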
CHECK4-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 4 +// CHECK4-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [1 x i8*]* +// CHECK4-NEXT: [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 4 +// CHECK4-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.0* +// CHECK4-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4 +// CHECK4-NEXT: [[TMP8:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP4]], i32 0, i32 0 +// CHECK4-NEXT: [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 4 +// CHECK4-NEXT: [[TMP10:%.*]] = bitcast i8* [[TMP9]] to double* +// CHECK4-NEXT: [[E:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0:%.*]], %struct._globalized_locals_ty.0* [[TMP6]], i32 0, i32 0 +// CHECK4-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2048 x double], [2048 x double]* [[E]], i32 0, i32 [[TMP7]] +// CHECK4-NEXT: [[TMP12:%.*]] = load double, double* [[TMP10]], align 8 +// CHECK4-NEXT: store double [[TMP12]], double* [[TMP11]], align 128 +// CHECK4-NEXT: ret void +// +// +// CHECK4-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_reduce_func +// CHECK4-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { +// CHECK4-NEXT: entry: +// CHECK4-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 +// CHECK4-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4 +// CHECK4-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x i8*], align 4 +// CHECK4-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 +// CHECK4-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK4-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4 +// CHECK4-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4 +// CHECK4-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.0* +// CHECK4-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4 +// CHECK4-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0 +// CHECK4-NEXT: [[E:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0:%.*]], %struct._globalized_locals_ty.0* [[TMP4]], i32 0, i32 0 +// CHECK4-NEXT: [[TMP7:%.*]] = getelementptr inbounds [2048 x double], [2048 x double]* [[E]], i32 0, i32 [[TMP5]] +// CHECK4-NEXT: [[TMP8:%.*]] = bitcast double* [[TMP7]] to i8* +// CHECK4-NEXT: store i8* [[TMP8]], i8** [[TMP6]], align 4 +// CHECK4-NEXT: [[TMP9:%.*]] = bitcast [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8* +// CHECK4-NEXT: [[TMP10:%.*]] = load i8*, i8** [[DOTADDR2]], align 4 +// CHECK4-NEXT: call void @"_omp$reduction$reduction_func"(i8* [[TMP9]], i8* [[TMP10]]) #[[ATTR3]] +// CHECK4-NEXT: ret void +// +// +// CHECK4-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_copy_func +// CHECK4-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { +// CHECK4-NEXT: entry: +// CHECK4-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 +// CHECK4-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4 +// CHECK4-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 +// CHECK4-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK4-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4 +// CHECK4-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 4 +// CHECK4-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [1 x i8*]* +// CHECK4-NEXT: [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 4 +// CHECK4-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.0* +// CHECK4-NEXT: [[TMP7:%.*]] = load 
i32, i32* [[DOTADDR1]], align 4 +// CHECK4-NEXT: [[TMP8:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP4]], i32 0, i32 0 +// CHECK4-NEXT: [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 4 +// CHECK4-NEXT: [[TMP10:%.*]] = bitcast i8* [[TMP9]] to double* +// CHECK4-NEXT: [[E:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0:%.*]], %struct._globalized_locals_ty.0* [[TMP6]], i32 0, i32 0 +// CHECK4-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2048 x double], [2048 x double]* [[E]], i32 0, i32 [[TMP7]] +// CHECK4-NEXT: [[TMP12:%.*]] = load double, double* [[TMP11]], align 128 +// CHECK4-NEXT: store double [[TMP12]], double* [[TMP10]], align 8 +// CHECK4-NEXT: ret void +// +// +// CHECK4-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_reduce_func +// CHECK4-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { +// CHECK4-NEXT: entry: +// CHECK4-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 +// CHECK4-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4 +// CHECK4-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x i8*], align 4 +// CHECK4-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 +// CHECK4-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK4-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4 +// CHECK4-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4 +// CHECK4-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.0* +// CHECK4-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4 +// CHECK4-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0 +// CHECK4-NEXT: [[E:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0:%.*]], %struct._globalized_locals_ty.0* [[TMP4]], i32 0, i32 0 +// CHECK4-NEXT: [[TMP7:%.*]] = getelementptr inbounds [2048 x double], [2048 x double]* [[E]], i32 0, i32 [[TMP5]] +// CHECK4-NEXT: [[TMP8:%.*]] = bitcast double* [[TMP7]] to i8* +// CHECK4-NEXT: store i8* [[TMP8]], i8** [[TMP6]], align 4 +// CHECK4-NEXT: [[TMP9:%.*]] = bitcast [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8* +// CHECK4-NEXT: [[TMP10:%.*]] = load i8*, i8** [[DOTADDR2]], align 4 +// CHECK4-NEXT: call void @"_omp$reduction$reduction_func"(i8* [[TMP10]], i8* [[TMP9]]) #[[ATTR3]] +// CHECK4-NEXT: ret void +// +// +// CHECK4-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l29_worker +// CHECK4-SAME: () #[[ATTR0]] { +// CHECK4-NEXT: entry: +// CHECK4-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 4 +// CHECK4-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1 +// CHECK4-NEXT: store i8* null, i8** [[WORK_FN]], align 4 +// CHECK4-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1 +// CHECK4-NEXT: br label [[DOTAWAIT_WORK:%.*]] +// CHECK4: .await.work: +// CHECK4-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK4-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]]) +// CHECK4-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8 +// CHECK4-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1 +// CHECK4-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 4 +// CHECK4-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null +// CHECK4-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]] +// CHECK4: .select.workers: +// CHECK4-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1 +// CHECK4-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0 +// CHECK4-NEXT: br i1 [[IS_ACTIVE]], label 
[[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]] +// CHECK4: .execute.parallel: +// CHECK4-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) +// CHECK4-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)* +// CHECK4-NEXT: call void [[TMP5]](i16 0, i32 [[TMP4]]) +// CHECK4-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]] +// CHECK4: .terminate.parallel: +// CHECK4-NEXT: call void @__kmpc_kernel_end_parallel() +// CHECK4-NEXT: br label [[DOTBARRIER_PARALLEL]] +// CHECK4: .barrier.parallel: +// CHECK4-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK4-NEXT: br label [[DOTAWAIT_WORK]] +// CHECK4: .exit: +// CHECK4-NEXT: ret void +// +// +// CHECK4-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l29 +// CHECK4-SAME: (i32 [[C:%.*]], i32 [[D:%.*]]) #[[ATTR1]] { +// CHECK4-NEXT: entry: +// CHECK4-NEXT: [[C_ADDR:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[D_ADDR:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK4-NEXT: store i32 [[C]], i32* [[C_ADDR]], align 4 +// CHECK4-NEXT: store i32 [[D]], i32* [[D_ADDR]], align 4 +// CHECK4-NEXT: [[CONV:%.*]] = bitcast i32* [[C_ADDR]] to i8* +// CHECK4-NEXT: [[CONV1:%.*]] = bitcast i32* [[D_ADDR]] to float* +// CHECK4-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK4-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK4-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK4-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]] +// CHECK4-NEXT: [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]] +// CHECK4-NEXT: br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]] +// CHECK4: .worker: +// CHECK4-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l29_worker() #[[ATTR3]] +// CHECK4-NEXT: br label [[DOTEXIT:%.*]] +// CHECK4: .mastercheck: +// CHECK4-NEXT: [[NVPTX_TID2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK4-NEXT: [[NVPTX_NUM_THREADS3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK4-NEXT: [[NVPTX_WARP_SIZE4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK4-NEXT: [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE4]], 1 +// CHECK4-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS3]], 1 +// CHECK4-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], -1 +// CHECK4-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP2]], [[TMP3]] +// CHECK4-NEXT: [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID2]], [[MASTER_TID]] +// CHECK4-NEXT: br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]] +// CHECK4: .master: +// CHECK4-NEXT: [[NVPTX_NUM_THREADS5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK4-NEXT: [[NVPTX_WARP_SIZE6:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK4-NEXT: [[THREAD_LIMIT7:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS5]], [[NVPTX_WARP_SIZE6]] +// CHECK4-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT7]], i16 1) +// CHECK4-NEXT: call void @__kmpc_data_sharing_init_stack() +// CHECK4-NEXT: [[TMP5:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared1", align 2 +// CHECK4-NEXT: [[TMP6:%.*]] = load i32, i32* @"_openmp_static_kernel$size2", align 4 +// CHECK4-NEXT: call void @__kmpc_get_team_static_memory(i16 0, i8* addrspacecast (i8 addrspace(3)* 
getelementptr inbounds (%"union._shared_openmp_static_memory_type_$_", %"union._shared_openmp_static_memory_type_$_" addrspace(3)* @"_openmp_shared_static_glob_rd_$_", i32 0, i32 0, i32 0) to i8*), i32 [[TMP6]], i16 [[TMP5]], i8** addrspacecast (i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr" to i8**)) +// CHECK4-NEXT: [[TMP7:%.*]] = load i8*, i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr", align 4 +// CHECK4-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, i8* [[TMP7]], i32 0 +// CHECK4-NEXT: [[TMP9:%.*]] = bitcast i8* [[TMP8]] to %struct._globalized_locals_ty.1* +// CHECK4-NEXT: [[TMP10:%.*]] = load i8, i8* [[CONV]], align 4 +// CHECK4-NEXT: [[C8:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1:%.*]], %struct._globalized_locals_ty.1* [[TMP9]], i32 0, i32 1 +// CHECK4-NEXT: store i8 [[TMP10]], i8* [[C8]], align 4 +// CHECK4-NEXT: [[TMP11:%.*]] = load float, float* [[CONV1]], align 4 +// CHECK4-NEXT: [[D9:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1]], %struct._globalized_locals_ty.1* [[TMP9]], i32 0, i32 0 +// CHECK4-NEXT: store float [[TMP11]], float* [[D9]], align 4 +// CHECK4-NEXT: [[TMP12:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) +// CHECK4-NEXT: store i32 [[TMP12]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK4-NEXT: call void @__omp_outlined__3(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i8* [[C8]], float* [[D9]]) #[[ATTR3]] +// CHECK4-NEXT: [[TMP13:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared1", align 2 +// CHECK4-NEXT: call void @__kmpc_restore_team_static_memory(i16 0, i16 [[TMP13]]) +// CHECK4-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]] +// CHECK4: .termination.notifier: +// CHECK4-NEXT: call void @__kmpc_kernel_deinit(i16 1) +// CHECK4-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK4-NEXT: br label [[DOTEXIT]] +// CHECK4: .exit: +// CHECK4-NEXT: ret void +// +// +// CHECK4-LABEL: define {{[^@]+}}@__omp_outlined__3 +// CHECK4-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i8* nonnull align 1 dereferenceable(1) [[C:%.*]], float* nonnull align 4 dereferenceable(4) [[D:%.*]]) #[[ATTR1]] { +// CHECK4-NEXT: entry: +// CHECK4-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK4-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK4-NEXT: [[C_ADDR:%.*]] = alloca i8*, align 4 +// CHECK4-NEXT: [[D_ADDR:%.*]] = alloca float*, align 4 +// CHECK4-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 4 +// CHECK4-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK4-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK4-NEXT: store i8* [[C]], i8** [[C_ADDR]], align 4 +// CHECK4-NEXT: store float* [[D]], float** [[D_ADDR]], align 4 +// CHECK4-NEXT: [[TMP0:%.*]] = load i8*, i8** [[C_ADDR]], align 4 +// CHECK4-NEXT: [[TMP1:%.*]] = load float*, float** [[D_ADDR]], align 4 +// CHECK4-NEXT: [[TMP2:%.*]] = load i8*, i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr", align 4 +// CHECK4-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, i8* [[TMP2]], i32 8 +// CHECK4-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.2* +// CHECK4-NEXT: [[C1:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_2:%.*]], %struct._globalized_locals_ty.2* [[TMP4]], i32 0, i32 1 +// CHECK4-NEXT: [[D2:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_2]], %struct._globalized_locals_ty.2* [[TMP4]], i32 0, i32 0 
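+// Note: c and d are globalized instead of staying in private allocas because
+// the reduction machinery reaches them by address; they sit in the team's
+// static memory blob at offset 8, just past the l29 kernel's own
+// %struct._globalized_locals_ty.1. Judging from the field indices checked
+// here, the record is roughly
+//   struct _globalized_locals_ty_2 { float d; char c; }; // idx 0 = d, idx 1 = c
+// and the accesses that follow are the initializations plus the source's
+// c ^= 2 and d *= 33.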
+// CHECK4-NEXT: store i8 0, i8* [[C1]], align 4 +// CHECK4-NEXT: store float 1.000000e+00, float* [[D2]], align 4 +// CHECK4-NEXT: [[TMP5:%.*]] = load i8, i8* [[C1]], align 4 +// CHECK4-NEXT: [[CONV:%.*]] = sext i8 [[TMP5]] to i32 +// CHECK4-NEXT: [[XOR:%.*]] = xor i32 [[CONV]], 2 +// CHECK4-NEXT: [[CONV3:%.*]] = trunc i32 [[XOR]] to i8 +// CHECK4-NEXT: store i8 [[CONV3]], i8* [[C1]], align 4 +// CHECK4-NEXT: [[TMP6:%.*]] = load float, float* [[D2]], align 4 +// CHECK4-NEXT: [[MUL:%.*]] = fmul float [[TMP6]], 3.300000e+01 +// CHECK4-NEXT: store float [[MUL]], float* [[D2]], align 4 +// CHECK4-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK4-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4 +// CHECK4-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0 +// CHECK4-NEXT: store i8* [[C1]], i8** [[TMP9]], align 4 +// CHECK4-NEXT: [[TMP10:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1 +// CHECK4-NEXT: [[TMP11:%.*]] = bitcast float* [[D2]] to i8* +// CHECK4-NEXT: store i8* [[TMP11]], i8** [[TMP10]], align 4 +// CHECK4-NEXT: [[TMP12:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8* +// CHECK4-NEXT: [[TMP13:%.*]] = load i8*, i8** @"_openmp_teams_reductions_buffer_$_$ptr", align 4 +// CHECK4-NEXT: [[TMP14:%.*]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(%struct.ident_t* @[[GLOB1]], i32 [[TMP8]], i8* [[TMP13]], i32 2048, i8* [[TMP12]], void (i8*, i16, i16, i16)* @_omp_reduction_shuffle_and_reduce_func5, void (i8*, i32)* @_omp_reduction_inter_warp_copy_func6, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_copy_func7, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_reduce_func8, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_copy_func9, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_reduce_func10) +// CHECK4-NEXT: [[TMP15:%.*]] = icmp eq i32 [[TMP14]], 1 +// CHECK4-NEXT: br i1 [[TMP15]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]] +// CHECK4: .omp.reduction.then: +// CHECK4-NEXT: [[TMP16:%.*]] = load i8, i8* [[TMP0]], align 1 +// CHECK4-NEXT: [[CONV4:%.*]] = sext i8 [[TMP16]] to i32 +// CHECK4-NEXT: [[TMP17:%.*]] = load i8, i8* [[C1]], align 4 +// CHECK4-NEXT: [[CONV5:%.*]] = sext i8 [[TMP17]] to i32 +// CHECK4-NEXT: [[XOR6:%.*]] = xor i32 [[CONV4]], [[CONV5]] +// CHECK4-NEXT: [[CONV7:%.*]] = trunc i32 [[XOR6]] to i8 +// CHECK4-NEXT: store i8 [[CONV7]], i8* [[TMP0]], align 1 +// CHECK4-NEXT: [[TMP18:%.*]] = load float, float* [[TMP1]], align 4 +// CHECK4-NEXT: [[TMP19:%.*]] = load float, float* [[D2]], align 4 +// CHECK4-NEXT: [[MUL8:%.*]] = fmul float [[TMP18]], [[TMP19]] +// CHECK4-NEXT: store float [[MUL8]], float* [[TMP1]], align 4 +// CHECK4-NEXT: call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP8]]) +// CHECK4-NEXT: br label [[DOTOMP_REDUCTION_DONE]] +// CHECK4: .omp.reduction.done: +// CHECK4-NEXT: ret void +// +// +// CHECK4-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func5 +// CHECK4-SAME: (i8* [[TMP0:%.*]], i16 signext [[TMP1:%.*]], i16 signext [[TMP2:%.*]], i16 signext [[TMP3:%.*]]) #[[ATTR0]] { +// CHECK4-NEXT: entry: +// CHECK4-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 +// CHECK4-NEXT: [[DOTADDR1:%.*]] = alloca i16, align 2 +// CHECK4-NEXT: [[DOTADDR2:%.*]] = alloca i16, align 2 +// CHECK4-NEXT: [[DOTADDR3:%.*]] = alloca i16, align 2 +// CHECK4-NEXT: [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST:%.*]] = alloca [2 x i8*], align 4 +// CHECK4-NEXT: 
[[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca i8, align 1 +// CHECK4-NEXT: [[DOTOMP_REDUCTION_ELEMENT4:%.*]] = alloca float, align 4 +// CHECK4-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 +// CHECK4-NEXT: store i16 [[TMP1]], i16* [[DOTADDR1]], align 2 +// CHECK4-NEXT: store i16 [[TMP2]], i16* [[DOTADDR2]], align 2 +// CHECK4-NEXT: store i16 [[TMP3]], i16* [[DOTADDR3]], align 2 +// CHECK4-NEXT: [[TMP4:%.*]] = load i8*, i8** [[DOTADDR]], align 4 +// CHECK4-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to [2 x i8*]* +// CHECK4-NEXT: [[TMP6:%.*]] = load i16, i16* [[DOTADDR1]], align 2 +// CHECK4-NEXT: [[TMP7:%.*]] = load i16, i16* [[DOTADDR2]], align 2 +// CHECK4-NEXT: [[TMP8:%.*]] = load i16, i16* [[DOTADDR3]], align 2 +// CHECK4-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 0 +// CHECK4-NEXT: [[TMP10:%.*]] = load i8*, i8** [[TMP9]], align 4 +// CHECK4-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0 +// CHECK4-NEXT: [[TMP12:%.*]] = getelementptr i8, i8* [[TMP10]], i32 1 +// CHECK4-NEXT: [[TMP13:%.*]] = load i8, i8* [[TMP10]], align 1 +// CHECK4-NEXT: [[TMP14:%.*]] = sext i8 [[TMP13]] to i32 +// CHECK4-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK4-NEXT: [[TMP15:%.*]] = trunc i32 [[NVPTX_WARP_SIZE]] to i16 +// CHECK4-NEXT: [[TMP16:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP14]], i16 [[TMP7]], i16 [[TMP15]]) +// CHECK4-NEXT: [[TMP17:%.*]] = trunc i32 [[TMP16]] to i8 +// CHECK4-NEXT: store i8 [[TMP17]], i8* [[DOTOMP_REDUCTION_ELEMENT]], align 1 +// CHECK4-NEXT: [[TMP18:%.*]] = getelementptr i8, i8* [[TMP10]], i32 1 +// CHECK4-NEXT: [[TMP19:%.*]] = getelementptr i8, i8* [[DOTOMP_REDUCTION_ELEMENT]], i32 1 +// CHECK4-NEXT: store i8* [[DOTOMP_REDUCTION_ELEMENT]], i8** [[TMP11]], align 4 +// CHECK4-NEXT: [[TMP20:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 1 +// CHECK4-NEXT: [[TMP21:%.*]] = load i8*, i8** [[TMP20]], align 4 +// CHECK4-NEXT: [[TMP22:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1 +// CHECK4-NEXT: [[TMP23:%.*]] = bitcast i8* [[TMP21]] to float* +// CHECK4-NEXT: [[TMP24:%.*]] = getelementptr float, float* [[TMP23]], i32 1 +// CHECK4-NEXT: [[TMP25:%.*]] = bitcast float* [[TMP24]] to i8* +// CHECK4-NEXT: [[TMP26:%.*]] = bitcast float* [[TMP23]] to i32* +// CHECK4-NEXT: [[TMP27:%.*]] = bitcast float* [[DOTOMP_REDUCTION_ELEMENT4]] to i32* +// CHECK4-NEXT: [[TMP28:%.*]] = load i32, i32* [[TMP26]], align 4 +// CHECK4-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK4-NEXT: [[TMP29:%.*]] = trunc i32 [[NVPTX_WARP_SIZE5]] to i16 +// CHECK4-NEXT: [[TMP30:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP28]], i16 [[TMP7]], i16 [[TMP29]]) +// CHECK4-NEXT: store i32 [[TMP30]], i32* [[TMP27]], align 4 +// CHECK4-NEXT: [[TMP31:%.*]] = getelementptr i32, i32* [[TMP26]], i32 1 +// CHECK4-NEXT: [[TMP32:%.*]] = getelementptr i32, i32* [[TMP27]], i32 1 +// CHECK4-NEXT: [[TMP33:%.*]] = bitcast float* [[DOTOMP_REDUCTION_ELEMENT4]] to i8* +// CHECK4-NEXT: store i8* [[TMP33]], i8** [[TMP22]], align 4 +// CHECK4-NEXT: [[TMP34:%.*]] = icmp eq i16 [[TMP8]], 0 +// CHECK4-NEXT: [[TMP35:%.*]] = icmp eq i16 [[TMP8]], 1 +// CHECK4-NEXT: [[TMP36:%.*]] = icmp ult i16 [[TMP6]], [[TMP7]] +// CHECK4-NEXT: [[TMP37:%.*]] = and i1 [[TMP35]], [[TMP36]] +// CHECK4-NEXT: [[TMP38:%.*]] = icmp eq i16 [[TMP8]], 2 +// CHECK4-NEXT: [[TMP39:%.*]] = and 
i16 [[TMP6]], 1 +// CHECK4-NEXT: [[TMP40:%.*]] = icmp eq i16 [[TMP39]], 0 +// CHECK4-NEXT: [[TMP41:%.*]] = and i1 [[TMP38]], [[TMP40]] +// CHECK4-NEXT: [[TMP42:%.*]] = icmp sgt i16 [[TMP7]], 0 +// CHECK4-NEXT: [[TMP43:%.*]] = and i1 [[TMP41]], [[TMP42]] +// CHECK4-NEXT: [[TMP44:%.*]] = or i1 [[TMP34]], [[TMP37]] +// CHECK4-NEXT: [[TMP45:%.*]] = or i1 [[TMP44]], [[TMP43]] +// CHECK4-NEXT: br i1 [[TMP45]], label [[THEN:%.*]], label [[ELSE:%.*]] +// CHECK4: then: +// CHECK4-NEXT: [[TMP46:%.*]] = bitcast [2 x i8*]* [[TMP5]] to i8* +// CHECK4-NEXT: [[TMP47:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to i8* +// CHECK4-NEXT: call void @"_omp$reduction$reduction_func4"(i8* [[TMP46]], i8* [[TMP47]]) #[[ATTR3]] +// CHECK4-NEXT: br label [[IFCONT:%.*]] +// CHECK4: else: +// CHECK4-NEXT: br label [[IFCONT]] +// CHECK4: ifcont: +// CHECK4-NEXT: [[TMP48:%.*]] = icmp eq i16 [[TMP8]], 1 +// CHECK4-NEXT: [[TMP49:%.*]] = icmp uge i16 [[TMP6]], [[TMP7]] +// CHECK4-NEXT: [[TMP50:%.*]] = and i1 [[TMP48]], [[TMP49]] +// CHECK4-NEXT: br i1 [[TMP50]], label [[THEN6:%.*]], label [[ELSE7:%.*]] +// CHECK4: then6: +// CHECK4-NEXT: [[TMP51:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0 +// CHECK4-NEXT: [[TMP52:%.*]] = load i8*, i8** [[TMP51]], align 4 +// CHECK4-NEXT: [[TMP53:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 0 +// CHECK4-NEXT: [[TMP54:%.*]] = load i8*, i8** [[TMP53]], align 4 +// CHECK4-NEXT: [[TMP55:%.*]] = load i8, i8* [[TMP52]], align 1 +// CHECK4-NEXT: store i8 [[TMP55]], i8* [[TMP54]], align 1 +// CHECK4-NEXT: [[TMP56:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1 +// CHECK4-NEXT: [[TMP57:%.*]] = load i8*, i8** [[TMP56]], align 4 +// CHECK4-NEXT: [[TMP58:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 1 +// CHECK4-NEXT: [[TMP59:%.*]] = load i8*, i8** [[TMP58]], align 4 +// CHECK4-NEXT: [[TMP60:%.*]] = bitcast i8* [[TMP57]] to float* +// CHECK4-NEXT: [[TMP61:%.*]] = bitcast i8* [[TMP59]] to float* +// CHECK4-NEXT: [[TMP62:%.*]] = load float, float* [[TMP60]], align 4 +// CHECK4-NEXT: store float [[TMP62]], float* [[TMP61]], align 4 +// CHECK4-NEXT: br label [[IFCONT8:%.*]] +// CHECK4: else7: +// CHECK4-NEXT: br label [[IFCONT8]] +// CHECK4: ifcont8: +// CHECK4-NEXT: ret void +// +// +// CHECK4-LABEL: define {{[^@]+}}@_omp_reduction_inter_warp_copy_func6 +// CHECK4-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { +// CHECK4-NEXT: entry: +// CHECK4-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 +// CHECK4-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) +// CHECK4-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 +// CHECK4-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK4-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK4-NEXT: [[NVPTX_TID2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK4-NEXT: [[NVPTX_LANE_ID:%.*]] = and i32 [[NVPTX_TID2]], 31 +// CHECK4-NEXT: [[NVPTX_TID3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK4-NEXT: [[NVPTX_WARP_ID:%.*]] = ashr i32 [[NVPTX_TID3]], 5 +// CHECK4-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4 +// CHECK4-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]* +// CHECK4-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]]) +// CHECK4-NEXT: [[WARP_MASTER:%.*]] = icmp eq i32 
[[NVPTX_LANE_ID]], 0 +// CHECK4-NEXT: br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]] +// CHECK4: then: +// CHECK4-NEXT: [[TMP5:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0 +// CHECK4-NEXT: [[TMP6:%.*]] = load i8*, i8** [[TMP5]], align 4 +// CHECK4-NEXT: [[TMP7:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]] +// CHECK4-NEXT: [[TMP8:%.*]] = bitcast i32 addrspace(3)* [[TMP7]] to i8 addrspace(3)* +// CHECK4-NEXT: [[TMP9:%.*]] = load i8, i8* [[TMP6]], align 1 +// CHECK4-NEXT: store volatile i8 [[TMP9]], i8 addrspace(3)* [[TMP8]], align 1 +// CHECK4-NEXT: br label [[IFCONT:%.*]] +// CHECK4: else: +// CHECK4-NEXT: br label [[IFCONT]] +// CHECK4: ifcont: +// CHECK4-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]]) +// CHECK4-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTADDR1]], align 4 +// CHECK4-NEXT: [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP10]] +// CHECK4-NEXT: br i1 [[IS_ACTIVE_THREAD]], label [[THEN4:%.*]], label [[ELSE5:%.*]] +// CHECK4: then4: +// CHECK4-NEXT: [[TMP11:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]] +// CHECK4-NEXT: [[TMP12:%.*]] = bitcast i32 addrspace(3)* [[TMP11]] to i8 addrspace(3)* +// CHECK4-NEXT: [[TMP13:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0 +// CHECK4-NEXT: [[TMP14:%.*]] = load i8*, i8** [[TMP13]], align 4 +// CHECK4-NEXT: [[TMP15:%.*]] = load volatile i8, i8 addrspace(3)* [[TMP12]], align 1 +// CHECK4-NEXT: store i8 [[TMP15]], i8* [[TMP14]], align 1 +// CHECK4-NEXT: br label [[IFCONT6:%.*]] +// CHECK4: else5: +// CHECK4-NEXT: br label [[IFCONT6]] +// CHECK4: ifcont6: +// CHECK4-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]]) +// CHECK4-NEXT: [[WARP_MASTER7:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0 +// CHECK4-NEXT: br i1 [[WARP_MASTER7]], label [[THEN8:%.*]], label [[ELSE9:%.*]] +// CHECK4: then8: +// CHECK4-NEXT: [[TMP16:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1 +// CHECK4-NEXT: [[TMP17:%.*]] = load i8*, i8** [[TMP16]], align 4 +// CHECK4-NEXT: [[TMP18:%.*]] = bitcast i8* [[TMP17]] to i32* +// CHECK4-NEXT: [[TMP19:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]] +// CHECK4-NEXT: [[TMP20:%.*]] = load i32, i32* [[TMP18]], align 4 +// CHECK4-NEXT: store volatile i32 [[TMP20]], i32 addrspace(3)* [[TMP19]], align 4 +// CHECK4-NEXT: br label [[IFCONT10:%.*]] +// CHECK4: else9: +// CHECK4-NEXT: br label [[IFCONT10]] +// CHECK4: ifcont10: +// CHECK4-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]]) +// CHECK4-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTADDR1]], align 4 +// CHECK4-NEXT: [[IS_ACTIVE_THREAD11:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP21]] +// CHECK4-NEXT: br i1 [[IS_ACTIVE_THREAD11]], label [[THEN12:%.*]], label [[ELSE13:%.*]] +// CHECK4: then12: +// CHECK4-NEXT: [[TMP22:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]] +// CHECK4-NEXT: [[TMP23:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1 +// CHECK4-NEXT: [[TMP24:%.*]] = load i8*, i8** [[TMP23]], align 4 +// CHECK4-NEXT: [[TMP25:%.*]] = bitcast i8* [[TMP24]] to i32* +// CHECK4-NEXT: [[TMP26:%.*]] = load 
volatile i32, i32 addrspace(3)* [[TMP22]], align 4 +// CHECK4-NEXT: store i32 [[TMP26]], i32* [[TMP25]], align 4 +// CHECK4-NEXT: br label [[IFCONT14:%.*]] +// CHECK4: else13: +// CHECK4-NEXT: br label [[IFCONT14]] +// CHECK4: ifcont14: +// CHECK4-NEXT: ret void +// +// +// CHECK4-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_copy_func7 +// CHECK4-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { +// CHECK4-NEXT: entry: +// CHECK4-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 +// CHECK4-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4 +// CHECK4-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 +// CHECK4-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK4-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4 +// CHECK4-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 4 +// CHECK4-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]* +// CHECK4-NEXT: [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 4 +// CHECK4-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.3* +// CHECK4-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4 +// CHECK4-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0 +// CHECK4-NEXT: [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 4 +// CHECK4-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_3:%.*]], %struct._globalized_locals_ty.3* [[TMP6]], i32 0, i32 0 +// CHECK4-NEXT: [[TMP10:%.*]] = getelementptr inbounds [2048 x i8], [2048 x i8]* [[C]], i32 0, i32 [[TMP7]] +// CHECK4-NEXT: [[TMP11:%.*]] = load i8, i8* [[TMP9]], align 1 +// CHECK4-NEXT: store i8 [[TMP11]], i8* [[TMP10]], align 128 +// CHECK4-NEXT: [[TMP12:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1 +// CHECK4-NEXT: [[TMP13:%.*]] = load i8*, i8** [[TMP12]], align 4 +// CHECK4-NEXT: [[TMP14:%.*]] = bitcast i8* [[TMP13]] to float* +// CHECK4-NEXT: [[D:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_3]], %struct._globalized_locals_ty.3* [[TMP6]], i32 0, i32 1 +// CHECK4-NEXT: [[TMP15:%.*]] = getelementptr inbounds [2048 x float], [2048 x float]* [[D]], i32 0, i32 [[TMP7]] +// CHECK4-NEXT: [[TMP16:%.*]] = load float, float* [[TMP14]], align 4 +// CHECK4-NEXT: store float [[TMP16]], float* [[TMP15]], align 128 +// CHECK4-NEXT: ret void +// +// +// CHECK4-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_reduce_func8 +// CHECK4-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { +// CHECK4-NEXT: entry: +// CHECK4-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 +// CHECK4-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4 +// CHECK4-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 4 +// CHECK4-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 +// CHECK4-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK4-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4 +// CHECK4-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4 +// CHECK4-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.3* +// CHECK4-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4 +// CHECK4-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0 +// CHECK4-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_3:%.*]], %struct._globalized_locals_ty.3* [[TMP4]], i32 0, i32 0 +// CHECK4-NEXT: [[TMP7:%.*]] = getelementptr 
inbounds [2048 x i8], [2048 x i8]* [[C]], i32 0, i32 [[TMP5]] +// CHECK4-NEXT: store i8* [[TMP7]], i8** [[TMP6]], align 4 +// CHECK4-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1 +// CHECK4-NEXT: [[D:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_3]], %struct._globalized_locals_ty.3* [[TMP4]], i32 0, i32 1 +// CHECK4-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2048 x float], [2048 x float]* [[D]], i32 0, i32 [[TMP5]] +// CHECK4-NEXT: [[TMP10:%.*]] = bitcast float* [[TMP9]] to i8* +// CHECK4-NEXT: store i8* [[TMP10]], i8** [[TMP8]], align 4 +// CHECK4-NEXT: [[TMP11:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8* +// CHECK4-NEXT: [[TMP12:%.*]] = load i8*, i8** [[DOTADDR2]], align 4 +// CHECK4-NEXT: call void @"_omp$reduction$reduction_func4"(i8* [[TMP11]], i8* [[TMP12]]) #[[ATTR3]] +// CHECK4-NEXT: ret void +// +// +// CHECK4-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_copy_func9 +// CHECK4-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { +// CHECK4-NEXT: entry: +// CHECK4-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 +// CHECK4-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4 +// CHECK4-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 +// CHECK4-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK4-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4 +// CHECK4-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 4 +// CHECK4-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]* +// CHECK4-NEXT: [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 4 +// CHECK4-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.3* +// CHECK4-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4 +// CHECK4-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0 +// CHECK4-NEXT: [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 4 +// CHECK4-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_3:%.*]], %struct._globalized_locals_ty.3* [[TMP6]], i32 0, i32 0 +// CHECK4-NEXT: [[TMP10:%.*]] = getelementptr inbounds [2048 x i8], [2048 x i8]* [[C]], i32 0, i32 [[TMP7]] +// CHECK4-NEXT: [[TMP11:%.*]] = load i8, i8* [[TMP10]], align 128 +// CHECK4-NEXT: store i8 [[TMP11]], i8* [[TMP9]], align 1 +// CHECK4-NEXT: [[TMP12:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1 +// CHECK4-NEXT: [[TMP13:%.*]] = load i8*, i8** [[TMP12]], align 4 +// CHECK4-NEXT: [[TMP14:%.*]] = bitcast i8* [[TMP13]] to float* +// CHECK4-NEXT: [[D:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_3]], %struct._globalized_locals_ty.3* [[TMP6]], i32 0, i32 1 +// CHECK4-NEXT: [[TMP15:%.*]] = getelementptr inbounds [2048 x float], [2048 x float]* [[D]], i32 0, i32 [[TMP7]] +// CHECK4-NEXT: [[TMP16:%.*]] = load float, float* [[TMP15]], align 128 +// CHECK4-NEXT: store float [[TMP16]], float* [[TMP14]], align 4 +// CHECK4-NEXT: ret void +// +// +// CHECK4-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_reduce_func10 +// CHECK4-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { +// CHECK4-NEXT: entry: +// CHECK4-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 +// CHECK4-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4 +// CHECK4-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 4 +// CHECK4-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 +// CHECK4-NEXT: 
store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK4-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4 +// CHECK4-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4 +// CHECK4-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.3* +// CHECK4-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4 +// CHECK4-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0 +// CHECK4-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_3:%.*]], %struct._globalized_locals_ty.3* [[TMP4]], i32 0, i32 0 +// CHECK4-NEXT: [[TMP7:%.*]] = getelementptr inbounds [2048 x i8], [2048 x i8]* [[C]], i32 0, i32 [[TMP5]] +// CHECK4-NEXT: store i8* [[TMP7]], i8** [[TMP6]], align 4 +// CHECK4-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1 +// CHECK4-NEXT: [[D:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_3]], %struct._globalized_locals_ty.3* [[TMP4]], i32 0, i32 1 +// CHECK4-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2048 x float], [2048 x float]* [[D]], i32 0, i32 [[TMP5]] +// CHECK4-NEXT: [[TMP10:%.*]] = bitcast float* [[TMP9]] to i8* +// CHECK4-NEXT: store i8* [[TMP10]], i8** [[TMP8]], align 4 +// CHECK4-NEXT: [[TMP11:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8* +// CHECK4-NEXT: [[TMP12:%.*]] = load i8*, i8** [[DOTADDR2]], align 4 +// CHECK4-NEXT: call void @"_omp$reduction$reduction_func4"(i8* [[TMP12]], i8* [[TMP11]]) #[[ATTR3]] +// CHECK4-NEXT: ret void +// +// +// CHECK4-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l36 +// CHECK4-SAME: (i32 [[A:%.*]], i32 [[B:%.*]]) #[[ATTR1]] { +// CHECK4-NEXT: entry: +// CHECK4-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[B_ADDR:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK4-NEXT: store i32 [[A]], i32* [[A_ADDR]], align 4 +// CHECK4-NEXT: store i32 [[B]], i32* [[B_ADDR]], align 4 +// CHECK4-NEXT: [[CONV:%.*]] = bitcast i32* [[B_ADDR]] to i16* +// CHECK4-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK4-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 1) +// CHECK4-NEXT: call void @__kmpc_data_sharing_init_stack_spmd() +// CHECK4-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK4: .execute: +// CHECK4-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3:[0-9]+]]) +// CHECK4-NEXT: store i32 [[TMP0]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK4-NEXT: call void @__omp_outlined__11(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32* [[A_ADDR]], i16* [[CONV]]) #[[ATTR3]] +// CHECK4-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK4: .omp.deinit: +// CHECK4-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 1) +// CHECK4-NEXT: br label [[DOTEXIT:%.*]] +// CHECK4: .exit: +// CHECK4-NEXT: ret void +// +// +// CHECK4-LABEL: define {{[^@]+}}@__omp_outlined__11 +// CHECK4-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i16* nonnull align 2 dereferenceable(2) [[B:%.*]]) #[[ATTR1]] { +// CHECK4-NEXT: entry: +// CHECK4-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK4-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK4-NEXT: [[A_ADDR:%.*]] = alloca i32*, 
align 4
+// CHECK4-NEXT: [[B_ADDR:%.*]] = alloca i16*, align 4
+// CHECK4-NEXT: [[A1:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[B2:%.*]] = alloca i16, align 2
+// CHECK4-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [2 x i8*], align 4
+// CHECK4-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 4
+// CHECK4-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK4-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK4-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 4
+// CHECK4-NEXT: store i16* [[B]], i16** [[B_ADDR]], align 4
+// CHECK4-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 4
+// CHECK4-NEXT: [[TMP1:%.*]] = load i16*, i16** [[B_ADDR]], align 4
+// CHECK4-NEXT: store i32 0, i32* [[A1]], align 4
+// CHECK4-NEXT: store i16 -32768, i16* [[B2]], align 2
+// CHECK4-NEXT: [[TMP2:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0
+// CHECK4-NEXT: [[TMP3:%.*]] = bitcast i32* [[A1]] to i8*
+// CHECK4-NEXT: store i8* [[TMP3]], i8** [[TMP2]], align 4
+// CHECK4-NEXT: [[TMP4:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1
+// CHECK4-NEXT: [[TMP5:%.*]] = bitcast i16* [[B2]] to i8*
+// CHECK4-NEXT: store i8* [[TMP5]], i8** [[TMP4]], align 4
+// CHECK4-NEXT: [[TMP6:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK4-NEXT: [[TMP7:%.*]] = load i32, i32* [[TMP6]], align 4
+// CHECK4-NEXT: [[TMP8:%.*]] = bitcast [2 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK4-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP7]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32*, i16*)* @__omp_outlined__12 to i8*), i8* null, i8** [[TMP8]], i32 2)
+// CHECK4-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0
+// CHECK4-NEXT: [[TMP10:%.*]] = bitcast i32* [[A1]] to i8*
+// CHECK4-NEXT: store i8* [[TMP10]], i8** [[TMP9]], align 4
+// CHECK4-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1
+// CHECK4-NEXT: [[TMP12:%.*]] = bitcast i16* [[B2]] to i8*
+// CHECK4-NEXT: store i8* [[TMP12]], i8** [[TMP11]], align 4
+// CHECK4-NEXT: [[TMP13:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
+// CHECK4-NEXT: [[TMP14:%.*]] = load i8*, i8** @"_openmp_teams_reductions_buffer_$_$ptr", align 4
+// CHECK4-NEXT: [[TMP15:%.*]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(%struct.ident_t* @[[GLOB3]], i32 [[TMP7]], i8* [[TMP14]], i32 2048, i8* [[TMP13]], void (i8*, i16, i16, i16)* @_omp_reduction_shuffle_and_reduce_func17, void (i8*, i32)* @_omp_reduction_inter_warp_copy_func18, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_copy_func19, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_reduce_func20, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_copy_func21, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_reduce_func22)
+// CHECK4-NEXT: [[TMP16:%.*]] = icmp eq i32 [[TMP15]], 1
+// CHECK4-NEXT: br i1 [[TMP16]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]]
+// CHECK4: .omp.reduction.then:
+// CHECK4-NEXT: [[TMP17:%.*]] = load i32, i32* [[TMP0]], align 4
+// CHECK4-NEXT: [[TMP18:%.*]] = load i32, i32* [[A1]], align 4
+// CHECK4-NEXT: [[OR:%.*]] = or i32 [[TMP17]], [[TMP18]]
+// CHECK4-NEXT: store i32 [[OR]], i32* [[TMP0]], align 4
+// CHECK4-NEXT: [[TMP19:%.*]] = load i16, i16* [[TMP1]], align 2
+// CHECK4-NEXT: [[CONV:%.*]] = sext i16 [[TMP19]] to i32
+// CHECK4-NEXT: [[TMP20:%.*]] = load i16, i16* [[B2]], align 2
+// CHECK4-NEXT: [[CONV3:%.*]] = sext i16 [[TMP20]] to i32
+// CHECK4-NEXT: [[CMP:%.*]] = icmp sgt i32 [[CONV]], [[CONV3]]
+// CHECK4-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK4: cond.true:
+// CHECK4-NEXT: [[TMP21:%.*]] = load i16, i16* [[TMP1]], align 2
+// CHECK4-NEXT: br label [[COND_END:%.*]]
+// CHECK4: cond.false:
+// CHECK4-NEXT: [[TMP22:%.*]] = load i16, i16* [[B2]], align 2
+// CHECK4-NEXT: br label [[COND_END]]
+// CHECK4: cond.end:
+// CHECK4-NEXT: [[COND:%.*]] = phi i16 [ [[TMP21]], [[COND_TRUE]] ], [ [[TMP22]], [[COND_FALSE]] ]
+// CHECK4-NEXT: store i16 [[COND]], i16* [[TMP1]], align 2
+// CHECK4-NEXT: call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP7]])
+// CHECK4-NEXT: br label [[DOTOMP_REDUCTION_DONE]]
+// CHECK4: .omp.reduction.done:
+// CHECK4-NEXT: ret void
+//
+//
+// CHECK4-LABEL: define {{[^@]+}}@__omp_outlined__12
+// CHECK4-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i16* nonnull align 2 dereferenceable(2) [[B:%.*]]) #[[ATTR1]] {
+// CHECK4-NEXT: entry:
+// CHECK4-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK4-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK4-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 4
+// CHECK4-NEXT: [[B_ADDR:%.*]] = alloca i16*, align 4
+// CHECK4-NEXT: [[A1:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[B2:%.*]] = alloca i16, align 2
+// CHECK4-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 4
+// CHECK4-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK4-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK4-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 4
+// CHECK4-NEXT: store i16* [[B]], i16** [[B_ADDR]], align 4
+// CHECK4-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 4
+// CHECK4-NEXT: [[TMP1:%.*]] = load i16*, i16** [[B_ADDR]], align 4
+// CHECK4-NEXT: store i32 0, i32* [[A1]], align 4
+// CHECK4-NEXT: store i16 -32768, i16* [[B2]], align 2
+// CHECK4-NEXT: [[TMP2:%.*]] = load i32, i32* [[A1]], align 4
+// CHECK4-NEXT: [[OR:%.*]] = or i32 [[TMP2]], 1
+// CHECK4-NEXT: store i32 [[OR]], i32* [[A1]], align 4
+// CHECK4-NEXT: [[TMP3:%.*]] = load i16, i16* [[B2]], align 2
+// CHECK4-NEXT: [[CONV:%.*]] = sext i16 [[TMP3]] to i32
+// CHECK4-NEXT: [[CMP:%.*]] = icmp sgt i32 99, [[CONV]]
+// CHECK4-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK4: cond.true:
+// CHECK4-NEXT: br label [[COND_END:%.*]]
+// CHECK4: cond.false:
+// CHECK4-NEXT: [[TMP4:%.*]] = load i16, i16* [[B2]], align 2
+// CHECK4-NEXT: [[CONV3:%.*]] = sext i16 [[TMP4]] to i32
+// CHECK4-NEXT: br label [[COND_END]]
+// CHECK4: cond.end:
+// CHECK4-NEXT: [[COND:%.*]] = phi i32 [ 99, [[COND_TRUE]] ], [ [[CONV3]], [[COND_FALSE]] ]
+// CHECK4-NEXT: [[CONV4:%.*]] = trunc i32 [[COND]] to i16
+// CHECK4-NEXT: store i16 [[CONV4]], i16* [[B2]], align 2
+// CHECK4-NEXT: [[TMP5:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK4-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4
+// CHECK4-NEXT: [[TMP7:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0
+// CHECK4-NEXT: [[TMP8:%.*]] = bitcast i32* [[A1]] to i8*
+// CHECK4-NEXT: store i8* [[TMP8]], i8** [[TMP7]], align 4
+// CHECK4-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1
+// CHECK4-NEXT: [[TMP10:%.*]] = bitcast i16* [[B2]] to i8*
+// CHECK4-NEXT: store i8* [[TMP10]], i8** [[TMP9]], align 4
+// CHECK4-NEXT: [[TMP11:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
+// CHECK4-NEXT: [[TMP12:%.*]] = call i32 @__kmpc_nvptx_parallel_reduce_nowait_v2(%struct.ident_t* @[[GLOB3]], i32 [[TMP6]], i32 2, i32 8, i8* [[TMP11]], void (i8*, i16, i16, i16)* @_omp_reduction_shuffle_and_reduce_func14, void (i8*, i32)* @_omp_reduction_inter_warp_copy_func15)
+// CHECK4-NEXT: [[TMP13:%.*]] = icmp eq i32 [[TMP12]], 1
+// CHECK4-NEXT: br i1 [[TMP13]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]]
+// CHECK4: .omp.reduction.then:
+// CHECK4-NEXT: [[TMP14:%.*]] = load i32, i32* [[TMP0]], align 4
+// CHECK4-NEXT: [[TMP15:%.*]] = load i32, i32* [[A1]], align 4
+// CHECK4-NEXT: [[OR5:%.*]] = or i32 [[TMP14]], [[TMP15]]
+// CHECK4-NEXT: store i32 [[OR5]], i32* [[TMP0]], align 4
+// CHECK4-NEXT: [[TMP16:%.*]] = load i16, i16* [[TMP1]], align 2
+// CHECK4-NEXT: [[CONV6:%.*]] = sext i16 [[TMP16]] to i32
+// CHECK4-NEXT: [[TMP17:%.*]] = load i16, i16* [[B2]], align 2
+// CHECK4-NEXT: [[CONV7:%.*]] = sext i16 [[TMP17]] to i32
+// CHECK4-NEXT: [[CMP8:%.*]] = icmp sgt i32 [[CONV6]], [[CONV7]]
+// CHECK4-NEXT: br i1 [[CMP8]], label [[COND_TRUE9:%.*]], label [[COND_FALSE10:%.*]]
+// CHECK4: cond.true9:
+// CHECK4-NEXT: [[TMP18:%.*]] = load i16, i16* [[TMP1]], align 2
+// CHECK4-NEXT: br label [[COND_END11:%.*]]
+// CHECK4: cond.false10:
+// CHECK4-NEXT: [[TMP19:%.*]] = load i16, i16* [[B2]], align 2
+// CHECK4-NEXT: br label [[COND_END11]]
+// CHECK4: cond.end11:
+// CHECK4-NEXT: [[COND12:%.*]] = phi i16 [ [[TMP18]], [[COND_TRUE9]] ], [ [[TMP19]], [[COND_FALSE10]] ]
+// CHECK4-NEXT: store i16 [[COND12]], i16* [[TMP1]], align 2
+// CHECK4-NEXT: call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP6]])
+// CHECK4-NEXT: br label [[DOTOMP_REDUCTION_DONE]]
+// CHECK4: .omp.reduction.done:
+// CHECK4-NEXT: ret void
+//
+//
+// CHECK4-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func14
+// CHECK4-SAME: (i8* [[TMP0:%.*]], i16 signext [[TMP1:%.*]], i16 signext [[TMP2:%.*]], i16 signext [[TMP3:%.*]]) #[[ATTR0]] {
+// CHECK4-NEXT: entry:
+// CHECK4-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
+// CHECK4-NEXT: [[DOTADDR1:%.*]] = alloca i16, align 2
+// CHECK4-NEXT: [[DOTADDR2:%.*]] = alloca i16, align 2
+// CHECK4-NEXT: [[DOTADDR3:%.*]] = alloca i16, align 2
+// CHECK4-NEXT: [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST:%.*]] = alloca [2 x i8*], align 4
+// CHECK4-NEXT: [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[DOTOMP_REDUCTION_ELEMENT4:%.*]] = alloca i16, align 2
+// CHECK4-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
+// CHECK4-NEXT: store i16 [[TMP1]], i16* [[DOTADDR1]], align 2
+// CHECK4-NEXT: store i16 [[TMP2]], i16* [[DOTADDR2]], align 2
+// CHECK4-NEXT: store i16 [[TMP3]], i16* [[DOTADDR3]], align 2
+// CHECK4-NEXT: [[TMP4:%.*]] = load i8*, i8** [[DOTADDR]], align 4
+// CHECK4-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to [2 x i8*]*
+// CHECK4-NEXT: [[TMP6:%.*]] = load i16, i16* [[DOTADDR1]], align 2
+// CHECK4-NEXT: [[TMP7:%.*]] = load i16, i16* [[DOTADDR2]], align 2
+// CHECK4-NEXT: [[TMP8:%.*]] = load i16, i16* [[DOTADDR3]], align 2
+// CHECK4-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 0
+// CHECK4-NEXT: [[TMP10:%.*]] = load i8*, i8** [[TMP9]], align 4
+// CHECK4-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0
+// CHECK4-NEXT: [[TMP12:%.*]] = bitcast i8* [[TMP10]] to i32*
+// CHECK4-NEXT: [[TMP13:%.*]] = getelementptr i32, i32* [[TMP12]], i32 1
+// CHECK4-NEXT: [[TMP14:%.*]] = bitcast i32* [[TMP13]] to i8*
+// CHECK4-NEXT: [[TMP15:%.*]] = load i32, i32* [[TMP12]], align 4
+// CHECK4-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+// CHECK4-NEXT: [[TMP16:%.*]] = trunc i32 [[NVPTX_WARP_SIZE]] to i16
+// CHECK4-NEXT: [[TMP17:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP15]], i16 [[TMP7]], i16 [[TMP16]])
+// CHECK4-NEXT: store i32 [[TMP17]], i32* [[DOTOMP_REDUCTION_ELEMENT]], align 4
+// CHECK4-NEXT: [[TMP18:%.*]] = getelementptr i32, i32* [[TMP12]], i32 1
+// CHECK4-NEXT: [[TMP19:%.*]] = getelementptr i32, i32* [[DOTOMP_REDUCTION_ELEMENT]], i32 1
+// CHECK4-NEXT: [[TMP20:%.*]] = bitcast i32* [[DOTOMP_REDUCTION_ELEMENT]] to i8*
+// CHECK4-NEXT: store i8* [[TMP20]], i8** [[TMP11]], align 4
+// CHECK4-NEXT: [[TMP21:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 1
+// CHECK4-NEXT: [[TMP22:%.*]] = load i8*, i8** [[TMP21]], align 4
+// CHECK4-NEXT: [[TMP23:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1
+// CHECK4-NEXT: [[TMP24:%.*]] = bitcast i8* [[TMP22]] to i16*
+// CHECK4-NEXT: [[TMP25:%.*]] = getelementptr i16, i16* [[TMP24]], i32 1
+// CHECK4-NEXT: [[TMP26:%.*]] = bitcast i16* [[TMP25]] to i8*
+// CHECK4-NEXT: [[TMP27:%.*]] = load i16, i16* [[TMP24]], align 2
+// CHECK4-NEXT: [[TMP28:%.*]] = sext i16 [[TMP27]] to i32
+// CHECK4-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+// CHECK4-NEXT: [[TMP29:%.*]] = trunc i32 [[NVPTX_WARP_SIZE5]] to i16
+// CHECK4-NEXT: [[TMP30:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP28]], i16 [[TMP7]], i16 [[TMP29]])
+// CHECK4-NEXT: [[TMP31:%.*]] = trunc i32 [[TMP30]] to i16
+// CHECK4-NEXT: store i16 [[TMP31]], i16* [[DOTOMP_REDUCTION_ELEMENT4]], align 2
+// CHECK4-NEXT: [[TMP32:%.*]] = getelementptr i16, i16* [[TMP24]], i32 1
+// CHECK4-NEXT: [[TMP33:%.*]] = getelementptr i16, i16* [[DOTOMP_REDUCTION_ELEMENT4]], i32 1
+// CHECK4-NEXT: [[TMP34:%.*]] = bitcast i16* [[DOTOMP_REDUCTION_ELEMENT4]] to i8*
+// CHECK4-NEXT: store i8* [[TMP34]], i8** [[TMP23]], align 4
+// CHECK4-NEXT: [[TMP35:%.*]] = icmp eq i16 [[TMP8]], 0
+// CHECK4-NEXT: [[TMP36:%.*]] = icmp eq i16 [[TMP8]], 1
+// CHECK4-NEXT: [[TMP37:%.*]] = icmp ult i16 [[TMP6]], [[TMP7]]
+// CHECK4-NEXT: [[TMP38:%.*]] = and i1 [[TMP36]], [[TMP37]]
+// CHECK4-NEXT: [[TMP39:%.*]] = icmp eq i16 [[TMP8]], 2
+// CHECK4-NEXT: [[TMP40:%.*]] = and i16 [[TMP6]], 1
+// CHECK4-NEXT: [[TMP41:%.*]] = icmp eq i16 [[TMP40]], 0
+// CHECK4-NEXT: [[TMP42:%.*]] = and i1 [[TMP39]], [[TMP41]]
+// CHECK4-NEXT: [[TMP43:%.*]] = icmp sgt i16 [[TMP7]], 0
+// CHECK4-NEXT: [[TMP44:%.*]] = and i1 [[TMP42]], [[TMP43]]
+// CHECK4-NEXT: [[TMP45:%.*]] = or i1 [[TMP35]], [[TMP38]]
+// CHECK4-NEXT: [[TMP46:%.*]] = or i1 [[TMP45]], [[TMP44]]
+// CHECK4-NEXT: br i1 [[TMP46]], label [[THEN:%.*]], label [[ELSE:%.*]]
+// CHECK4: then:
+// CHECK4-NEXT: [[TMP47:%.*]] = bitcast [2 x i8*]* [[TMP5]] to i8*
+// CHECK4-NEXT: [[TMP48:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to i8*
+// CHECK4-NEXT: call void @"_omp$reduction$reduction_func13"(i8* [[TMP47]], i8* [[TMP48]]) #[[ATTR3]]
+// CHECK4-NEXT: br label [[IFCONT:%.*]]
+// CHECK4: else:
+// CHECK4-NEXT: br label [[IFCONT]]
+// CHECK4: ifcont:
+// CHECK4-NEXT: [[TMP49:%.*]] = icmp eq i16 [[TMP8]], 1
+// CHECK4-NEXT: [[TMP50:%.*]] = icmp uge i16 [[TMP6]], [[TMP7]]
+// CHECK4-NEXT: [[TMP51:%.*]] = and i1 [[TMP49]], [[TMP50]]
+// CHECK4-NEXT: br i1 [[TMP51]], label [[THEN6:%.*]], label [[ELSE7:%.*]]
+// CHECK4: then6:
+// CHECK4-NEXT: [[TMP52:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0
+// CHECK4-NEXT: [[TMP53:%.*]] = load i8*, i8** [[TMP52]], align 4
+// CHECK4-NEXT: [[TMP54:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 0
+// CHECK4-NEXT: [[TMP55:%.*]] = load i8*, i8** [[TMP54]], align 4
+// CHECK4-NEXT: [[TMP56:%.*]] = bitcast i8* [[TMP53]] to i32*
+// CHECK4-NEXT: [[TMP57:%.*]] = bitcast i8* [[TMP55]] to i32*
+// CHECK4-NEXT: [[TMP58:%.*]] = load i32, i32* [[TMP56]], align 4
+// CHECK4-NEXT: store i32 [[TMP58]], i32* [[TMP57]], align 4
+// CHECK4-NEXT: [[TMP59:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1
+// CHECK4-NEXT: [[TMP60:%.*]] = load i8*, i8** [[TMP59]], align 4
+// CHECK4-NEXT: [[TMP61:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 1
+// CHECK4-NEXT: [[TMP62:%.*]] = load i8*, i8** [[TMP61]], align 4
+// CHECK4-NEXT: [[TMP63:%.*]] = bitcast i8* [[TMP60]] to i16*
+// CHECK4-NEXT: [[TMP64:%.*]] = bitcast i8* [[TMP62]] to i16*
+// CHECK4-NEXT: [[TMP65:%.*]] = load i16, i16* [[TMP63]], align 2
+// CHECK4-NEXT: store i16 [[TMP65]], i16* [[TMP64]], align 2
+// CHECK4-NEXT: br label [[IFCONT8:%.*]]
+// CHECK4: else7:
+// CHECK4-NEXT: br label [[IFCONT8]]
+// CHECK4: ifcont8:
+// CHECK4-NEXT: ret void
+//
+//
+// CHECK4-LABEL: define {{[^@]+}}@_omp_reduction_inter_warp_copy_func15
+// CHECK4-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] {
+// CHECK4-NEXT: entry:
+// CHECK4-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
+// CHECK4-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]])
+// CHECK4-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
+// CHECK4-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
+// CHECK4-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+// CHECK4-NEXT: [[NVPTX_TID2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+// CHECK4-NEXT: [[NVPTX_LANE_ID:%.*]] = and i32 [[NVPTX_TID2]], 31
+// CHECK4-NEXT: [[NVPTX_TID3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+// CHECK4-NEXT: [[NVPTX_WARP_ID:%.*]] = ashr i32 [[NVPTX_TID3]], 5
+// CHECK4-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4
+// CHECK4-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]*
+// CHECK4-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4:[0-9]+]], i32 [[TMP2]])
+// CHECK4-NEXT: [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0
+// CHECK4-NEXT: br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]]
+// CHECK4: then:
+// CHECK4-NEXT: [[TMP5:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0
+// CHECK4-NEXT: [[TMP6:%.*]] = load i8*, i8** [[TMP5]], align 4
+// CHECK4-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to i32*
+// CHECK4-NEXT: [[TMP8:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]]
+// CHECK4-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP7]], align 4
+// CHECK4-NEXT: store volatile i32 [[TMP9]], i32 addrspace(3)* [[TMP8]], align 4
+// CHECK4-NEXT: br label [[IFCONT:%.*]]
+// CHECK4: else:
+// CHECK4-NEXT: br label [[IFCONT]]
+// CHECK4: ifcont:
+// CHECK4-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]])
+// CHECK4-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTADDR1]], align 4
+// CHECK4-NEXT: [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP10]]
+// CHECK4-NEXT: br i1 [[IS_ACTIVE_THREAD]], label [[THEN4:%.*]], label [[ELSE5:%.*]]
+// CHECK4: then4:
+// CHECK4-NEXT: [[TMP11:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]]
+// CHECK4-NEXT: [[TMP12:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0
+// CHECK4-NEXT: [[TMP13:%.*]] = load i8*, i8** [[TMP12]], align 4
+// CHECK4-NEXT: [[TMP14:%.*]] = bitcast i8* [[TMP13]] to i32*
+// CHECK4-NEXT: [[TMP15:%.*]] = load volatile i32, i32 addrspace(3)* [[TMP11]], align 4
+// CHECK4-NEXT: store i32 [[TMP15]], i32* [[TMP14]], align 4
+// CHECK4-NEXT: br label [[IFCONT6:%.*]]
+// CHECK4: else5:
+// CHECK4-NEXT: br label [[IFCONT6]]
+// CHECK4: ifcont6:
+// CHECK4-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]])
+// CHECK4-NEXT: [[WARP_MASTER7:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0
+// CHECK4-NEXT: br i1 [[WARP_MASTER7]], label [[THEN8:%.*]], label [[ELSE9:%.*]]
+// CHECK4: then8:
+// CHECK4-NEXT: [[TMP16:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1
+// CHECK4-NEXT: [[TMP17:%.*]] = load i8*, i8** [[TMP16]], align 4
+// CHECK4-NEXT: [[TMP18:%.*]] = bitcast i8* [[TMP17]] to i16*
+// CHECK4-NEXT: [[TMP19:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]]
+// CHECK4-NEXT: [[TMP20:%.*]] = bitcast i32 addrspace(3)* [[TMP19]] to i16 addrspace(3)*
+// CHECK4-NEXT: [[TMP21:%.*]] = load i16, i16* [[TMP18]], align 2
+// CHECK4-NEXT: store volatile i16 [[TMP21]], i16 addrspace(3)* [[TMP20]], align 2
+// CHECK4-NEXT: br label [[IFCONT10:%.*]]
+// CHECK4: else9:
+// CHECK4-NEXT: br label [[IFCONT10]]
+// CHECK4: ifcont10:
+// CHECK4-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]])
+// CHECK4-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTADDR1]], align 4
+// CHECK4-NEXT: [[IS_ACTIVE_THREAD11:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP22]]
+// CHECK4-NEXT: br i1 [[IS_ACTIVE_THREAD11]], label [[THEN12:%.*]], label [[ELSE13:%.*]]
+// CHECK4: then12:
+// CHECK4-NEXT: [[TMP23:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]]
+// CHECK4-NEXT: [[TMP24:%.*]] = bitcast i32 addrspace(3)* [[TMP23]] to i16 addrspace(3)*
+// CHECK4-NEXT: [[TMP25:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1
+// CHECK4-NEXT: [[TMP26:%.*]] = load i8*, i8** [[TMP25]], align 4
+// CHECK4-NEXT: [[TMP27:%.*]] = bitcast i8* [[TMP26]] to i16*
+// CHECK4-NEXT: [[TMP28:%.*]] = load volatile i16, i16 addrspace(3)* [[TMP24]], align 2
+// CHECK4-NEXT: store i16 [[TMP28]], i16* [[TMP27]], align 2
+// CHECK4-NEXT: br label [[IFCONT14:%.*]]
+// CHECK4: else13:
+// CHECK4-NEXT: br label [[IFCONT14]]
+// CHECK4: ifcont14:
+// CHECK4-NEXT: ret void
+//
+//
+// CHECK4-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func17
+// CHECK4-SAME: (i8* [[TMP0:%.*]], i16 signext [[TMP1:%.*]], i16 signext [[TMP2:%.*]], i16 signext [[TMP3:%.*]]) #[[ATTR0]] {
+// CHECK4-NEXT: entry:
+// CHECK4-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
+// CHECK4-NEXT: [[DOTADDR1:%.*]] = alloca i16, align 2
+// CHECK4-NEXT: [[DOTADDR2:%.*]] = alloca i16, align 2
+// CHECK4-NEXT: [[DOTADDR3:%.*]] = alloca i16, align 2
+// CHECK4-NEXT: [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST:%.*]] = alloca [2 x i8*], align 4
+// CHECK4-NEXT: [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[DOTOMP_REDUCTION_ELEMENT4:%.*]] = alloca i16, align 2
+// CHECK4-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
+// CHECK4-NEXT: store i16 [[TMP1]], i16* [[DOTADDR1]], align 2
+// CHECK4-NEXT: store i16 [[TMP2]], i16* [[DOTADDR2]], align 2
+// CHECK4-NEXT: store i16 [[TMP3]], i16* [[DOTADDR3]], align 2
+// CHECK4-NEXT: [[TMP4:%.*]] = load i8*, i8** [[DOTADDR]], align 4
+// CHECK4-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to [2 x i8*]*
+// CHECK4-NEXT: [[TMP6:%.*]] = load i16, i16* [[DOTADDR1]], align 2
+// CHECK4-NEXT: [[TMP7:%.*]] = load i16, i16* [[DOTADDR2]], align 2
+// CHECK4-NEXT: [[TMP8:%.*]] = load i16, i16* [[DOTADDR3]], align 2
+// CHECK4-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 0
+// CHECK4-NEXT: [[TMP10:%.*]] = load i8*, i8** [[TMP9]], align 4
+// CHECK4-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0
+// CHECK4-NEXT: [[TMP12:%.*]] = bitcast i8* [[TMP10]] to i32*
+// CHECK4-NEXT: [[TMP13:%.*]] = getelementptr i32, i32* [[TMP12]], i32 1
+// CHECK4-NEXT: [[TMP14:%.*]] = bitcast i32* [[TMP13]] to i8*
+// CHECK4-NEXT: [[TMP15:%.*]] = load i32, i32* [[TMP12]], align 4
+// CHECK4-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+// CHECK4-NEXT: [[TMP16:%.*]] = trunc i32 [[NVPTX_WARP_SIZE]] to i16
+// CHECK4-NEXT: [[TMP17:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP15]], i16 [[TMP7]], i16 [[TMP16]])
+// CHECK4-NEXT: store i32 [[TMP17]], i32* [[DOTOMP_REDUCTION_ELEMENT]], align 4
+// CHECK4-NEXT: [[TMP18:%.*]] = getelementptr i32, i32* [[TMP12]], i32 1
+// CHECK4-NEXT: [[TMP19:%.*]] = getelementptr i32, i32* [[DOTOMP_REDUCTION_ELEMENT]], i32 1
+// CHECK4-NEXT: [[TMP20:%.*]] = bitcast i32* [[DOTOMP_REDUCTION_ELEMENT]] to i8*
+// CHECK4-NEXT: store i8* [[TMP20]], i8** [[TMP11]], align 4
+// CHECK4-NEXT: [[TMP21:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 1
+// CHECK4-NEXT: [[TMP22:%.*]] = load i8*, i8** [[TMP21]], align 4
+// CHECK4-NEXT: [[TMP23:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1
+// CHECK4-NEXT: [[TMP24:%.*]] = bitcast i8* [[TMP22]] to i16*
+// CHECK4-NEXT: [[TMP25:%.*]] = getelementptr i16, i16* [[TMP24]], i32 1
+// CHECK4-NEXT: [[TMP26:%.*]] = bitcast i16* [[TMP25]] to i8*
+// CHECK4-NEXT: [[TMP27:%.*]] = load i16, i16* [[TMP24]], align 2
+// CHECK4-NEXT: [[TMP28:%.*]] = sext i16 [[TMP27]] to i32
+// CHECK4-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+// CHECK4-NEXT: [[TMP29:%.*]] = trunc i32 [[NVPTX_WARP_SIZE5]] to i16
+// CHECK4-NEXT: [[TMP30:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP28]], i16 [[TMP7]], i16 [[TMP29]])
+// CHECK4-NEXT: [[TMP31:%.*]] = trunc i32 [[TMP30]] to i16
+// CHECK4-NEXT: store i16 [[TMP31]], i16* [[DOTOMP_REDUCTION_ELEMENT4]], align 2
+// CHECK4-NEXT: [[TMP32:%.*]] = getelementptr i16, i16* [[TMP24]], i32 1
+// CHECK4-NEXT: [[TMP33:%.*]] = getelementptr i16, i16* [[DOTOMP_REDUCTION_ELEMENT4]], i32 1
+// CHECK4-NEXT: [[TMP34:%.*]] = bitcast i16* [[DOTOMP_REDUCTION_ELEMENT4]] to i8*
+// CHECK4-NEXT: store i8* [[TMP34]], i8** [[TMP23]], align 4
+// CHECK4-NEXT: [[TMP35:%.*]] = icmp eq i16 [[TMP8]], 0
+// CHECK4-NEXT: [[TMP36:%.*]] = icmp eq i16 [[TMP8]], 1
+// CHECK4-NEXT: [[TMP37:%.*]] = icmp ult i16 [[TMP6]], [[TMP7]]
+// CHECK4-NEXT: [[TMP38:%.*]] = and i1 [[TMP36]], [[TMP37]]
+// CHECK4-NEXT: [[TMP39:%.*]] = icmp eq i16 [[TMP8]], 2
+// CHECK4-NEXT: [[TMP40:%.*]] = and i16 [[TMP6]], 1
+// CHECK4-NEXT: [[TMP41:%.*]] = icmp eq i16 [[TMP40]], 0
+// CHECK4-NEXT: [[TMP42:%.*]] = and i1 [[TMP39]], [[TMP41]]
+// CHECK4-NEXT: [[TMP43:%.*]] = icmp sgt i16 [[TMP7]], 0
+// CHECK4-NEXT: [[TMP44:%.*]] = and i1 [[TMP42]], [[TMP43]]
+// CHECK4-NEXT: [[TMP45:%.*]] = or i1 [[TMP35]], [[TMP38]]
+// CHECK4-NEXT: [[TMP46:%.*]] = or i1 [[TMP45]], [[TMP44]]
+// CHECK4-NEXT: br i1 [[TMP46]], label [[THEN:%.*]], label [[ELSE:%.*]]
+// CHECK4: then:
+// CHECK4-NEXT: [[TMP47:%.*]] = bitcast [2 x i8*]* [[TMP5]] to i8*
+// CHECK4-NEXT: [[TMP48:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to i8*
+// CHECK4-NEXT: call void @"_omp$reduction$reduction_func16"(i8* [[TMP47]], i8* [[TMP48]]) #[[ATTR3]]
+// CHECK4-NEXT: br label [[IFCONT:%.*]]
+// CHECK4: else:
+// CHECK4-NEXT: br label [[IFCONT]]
+// CHECK4: ifcont:
+// CHECK4-NEXT: [[TMP49:%.*]] = icmp eq i16 [[TMP8]], 1
+// CHECK4-NEXT: [[TMP50:%.*]] = icmp uge i16 [[TMP6]], [[TMP7]]
+// CHECK4-NEXT: [[TMP51:%.*]] = and i1 [[TMP49]], [[TMP50]]
+// CHECK4-NEXT: br i1 [[TMP51]], label [[THEN6:%.*]], label [[ELSE7:%.*]]
+// CHECK4: then6:
+// CHECK4-NEXT: [[TMP52:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0
+// CHECK4-NEXT: [[TMP53:%.*]] = load i8*, i8** [[TMP52]], align 4
+// CHECK4-NEXT: [[TMP54:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 0
+// CHECK4-NEXT: [[TMP55:%.*]] = load i8*, i8** [[TMP54]], align 4
+// CHECK4-NEXT: [[TMP56:%.*]] = bitcast i8* [[TMP53]] to i32*
+// CHECK4-NEXT: [[TMP57:%.*]] = bitcast i8* [[TMP55]] to i32*
+// CHECK4-NEXT: [[TMP58:%.*]] = load i32, i32* [[TMP56]], align 4
+// CHECK4-NEXT: store i32 [[TMP58]], i32* [[TMP57]], align 4
+// CHECK4-NEXT: [[TMP59:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1
+// CHECK4-NEXT: [[TMP60:%.*]] = load i8*, i8** [[TMP59]], align 4
+// CHECK4-NEXT: [[TMP61:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 1
+// CHECK4-NEXT: [[TMP62:%.*]] = load i8*, i8** [[TMP61]], align 4
+// CHECK4-NEXT: [[TMP63:%.*]] = bitcast i8* [[TMP60]] to i16*
+// CHECK4-NEXT: [[TMP64:%.*]] = bitcast i8* [[TMP62]] to i16*
+// CHECK4-NEXT: [[TMP65:%.*]] = load i16, i16* [[TMP63]], align 2
+// CHECK4-NEXT: store i16 [[TMP65]], i16* [[TMP64]], align 2
+// CHECK4-NEXT: br label [[IFCONT8:%.*]]
+// CHECK4: else7:
+// CHECK4-NEXT: br label [[IFCONT8]]
+// CHECK4: ifcont8:
+// CHECK4-NEXT: ret void
+//
+//
+// CHECK4-LABEL: define {{[^@]+}}@_omp_reduction_inter_warp_copy_func18
+// CHECK4-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] {
+// CHECK4-NEXT: entry:
+// CHECK4-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
+// CHECK4-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]])
+// CHECK4-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
+// CHECK4-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
+// CHECK4-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+// CHECK4-NEXT: [[NVPTX_TID2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+// CHECK4-NEXT: [[NVPTX_LANE_ID:%.*]] = and i32 [[NVPTX_TID2]], 31
+// CHECK4-NEXT: [[NVPTX_TID3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+// CHECK4-NEXT: [[NVPTX_WARP_ID:%.*]] = ashr i32 [[NVPTX_TID3]], 5
+// CHECK4-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4
+// CHECK4-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]*
+// CHECK4-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]])
+// CHECK4-NEXT: [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0
+// CHECK4-NEXT: br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]]
+// CHECK4: then:
+// CHECK4-NEXT: [[TMP5:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0
+// CHECK4-NEXT: [[TMP6:%.*]] = load i8*, i8** [[TMP5]], align 4
+// CHECK4-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to i32*
+// CHECK4-NEXT: [[TMP8:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]]
+// CHECK4-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP7]], align 4
+// CHECK4-NEXT: store volatile i32 [[TMP9]], i32 addrspace(3)* [[TMP8]], align 4
+// CHECK4-NEXT: br label [[IFCONT:%.*]]
+// CHECK4: else:
+// CHECK4-NEXT: br label [[IFCONT]]
+// CHECK4: ifcont:
+// CHECK4-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]])
+// CHECK4-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTADDR1]], align 4
+// CHECK4-NEXT: [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP10]]
+// CHECK4-NEXT: br i1 [[IS_ACTIVE_THREAD]], label [[THEN4:%.*]], label [[ELSE5:%.*]]
+// CHECK4: then4:
+// CHECK4-NEXT: [[TMP11:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]]
+// CHECK4-NEXT: [[TMP12:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0
+// CHECK4-NEXT: [[TMP13:%.*]] = load i8*, i8** [[TMP12]], align 4
+// CHECK4-NEXT: [[TMP14:%.*]] = bitcast i8* [[TMP13]] to i32*
+// CHECK4-NEXT: [[TMP15:%.*]] = load volatile i32, i32 addrspace(3)* [[TMP11]], align 4
+// CHECK4-NEXT: store i32 [[TMP15]], i32* [[TMP14]], align 4
+// CHECK4-NEXT: br label [[IFCONT6:%.*]]
+// CHECK4: else5:
+// CHECK4-NEXT: br label [[IFCONT6]]
+// CHECK4: ifcont6:
+// CHECK4-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]])
+// CHECK4-NEXT: [[WARP_MASTER7:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0
+// CHECK4-NEXT: br i1 [[WARP_MASTER7]], label [[THEN8:%.*]], label [[ELSE9:%.*]]
+// CHECK4: then8:
+// CHECK4-NEXT: [[TMP16:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1
+// CHECK4-NEXT: [[TMP17:%.*]] = load i8*, i8** [[TMP16]], align 4
+// CHECK4-NEXT: [[TMP18:%.*]] = bitcast i8* [[TMP17]] to i16*
+// CHECK4-NEXT: [[TMP19:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]]
+// CHECK4-NEXT: [[TMP20:%.*]] = bitcast i32 addrspace(3)* [[TMP19]] to i16 addrspace(3)*
+// CHECK4-NEXT: [[TMP21:%.*]] = load i16, i16* [[TMP18]], align 2
+// CHECK4-NEXT: store volatile i16 [[TMP21]], i16 addrspace(3)* [[TMP20]], align 2
+// CHECK4-NEXT: br label [[IFCONT10:%.*]]
+// CHECK4: else9:
+// CHECK4-NEXT: br label [[IFCONT10]]
+// CHECK4: ifcont10:
+// CHECK4-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]])
+// CHECK4-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTADDR1]], align 4
+// CHECK4-NEXT: [[IS_ACTIVE_THREAD11:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP22]]
+// CHECK4-NEXT: br i1 [[IS_ACTIVE_THREAD11]], label [[THEN12:%.*]], label [[ELSE13:%.*]]
+// CHECK4: then12:
+// CHECK4-NEXT: [[TMP23:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]]
+// CHECK4-NEXT: [[TMP24:%.*]] = bitcast i32 addrspace(3)* [[TMP23]] to i16 addrspace(3)*
+// CHECK4-NEXT: [[TMP25:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1
+// CHECK4-NEXT: [[TMP26:%.*]] = load i8*, i8** [[TMP25]], align 4
+// CHECK4-NEXT: [[TMP27:%.*]] = bitcast i8* [[TMP26]] to i16*
+// CHECK4-NEXT: [[TMP28:%.*]] = load volatile i16, i16 addrspace(3)* [[TMP24]], align 2
+// CHECK4-NEXT: store i16 [[TMP28]], i16* [[TMP27]], align 2
+// CHECK4-NEXT: br label [[IFCONT14:%.*]]
+// CHECK4: else13:
+// CHECK4-NEXT: br label [[IFCONT14]]
+// CHECK4: ifcont14:
+// CHECK4-NEXT: ret void
+//
+//
+// CHECK4-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_copy_func19
+// CHECK4-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] {
+// CHECK4-NEXT: entry:
+// CHECK4-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
+// CHECK4-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4
+// CHECK4-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
+// CHECK4-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
+// CHECK4-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4
+// CHECK4-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 4
+// CHECK4-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]*
+// CHECK4-NEXT: [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 4
+// CHECK4-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.4*
+// CHECK4-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4
+// CHECK4-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0
+// CHECK4-NEXT: [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 4
+// CHECK4-NEXT: [[TMP10:%.*]] = bitcast i8* [[TMP9]] to i32*
+// CHECK4-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4:%.*]], %struct._globalized_locals_ty.4* [[TMP6]], i32 0, i32 0
+// CHECK4-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* [[A]], i32 0, i32 [[TMP7]]
+// CHECK4-NEXT: [[TMP12:%.*]] = load i32, i32* [[TMP10]], align 4
+// CHECK4-NEXT: store i32 [[TMP12]], i32* [[TMP11]], align 128
+// CHECK4-NEXT: [[TMP13:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1
+// CHECK4-NEXT: [[TMP14:%.*]] = load i8*, i8** [[TMP13]], align 4
+// CHECK4-NEXT: [[TMP15:%.*]] = bitcast i8* [[TMP14]] to i16*
+// CHECK4-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4]], %struct._globalized_locals_ty.4* [[TMP6]], i32 0, i32 1
+// CHECK4-NEXT: [[TMP16:%.*]] = getelementptr inbounds [2048 x i16], [2048 x i16]* [[B]], i32 0, i32 [[TMP7]]
+// CHECK4-NEXT: [[TMP17:%.*]] = load i16, i16* [[TMP15]], align 2
+// CHECK4-NEXT: store i16 [[TMP17]], i16* [[TMP16]], align 128
+// CHECK4-NEXT: ret void
+//
+//
+// CHECK4-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_reduce_func20
+// CHECK4-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] {
+// CHECK4-NEXT: entry:
+// CHECK4-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
+// CHECK4-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4
+// CHECK4-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 4
+// CHECK4-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
+// CHECK4-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
+// CHECK4-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4
+// CHECK4-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4
+// CHECK4-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.4*
+// CHECK4-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4
+// CHECK4-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0
+// CHECK4-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4:%.*]], %struct._globalized_locals_ty.4* [[TMP4]], i32 0, i32 0
+// CHECK4-NEXT: [[TMP7:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* [[A]], i32 0, i32 [[TMP5]]
+// CHECK4-NEXT: [[TMP8:%.*]] = bitcast i32* [[TMP7]] to i8*
+// CHECK4-NEXT: store i8* [[TMP8]], i8** [[TMP6]], align 4
+// CHECK4-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1
+// CHECK4-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4]], %struct._globalized_locals_ty.4* [[TMP4]], i32 0, i32 1
+// CHECK4-NEXT: [[TMP10:%.*]] = getelementptr inbounds [2048 x i16], [2048 x i16]* [[B]], i32 0, i32 [[TMP5]]
+// CHECK4-NEXT: [[TMP11:%.*]] = bitcast i16* [[TMP10]] to i8*
+// CHECK4-NEXT: store i8* [[TMP11]], i8** [[TMP9]], align 4
+// CHECK4-NEXT: [[TMP12:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
+// CHECK4-NEXT: [[TMP13:%.*]] = load i8*, i8** [[DOTADDR2]], align 4
+// CHECK4-NEXT: call void @"_omp$reduction$reduction_func16"(i8* [[TMP12]], i8* [[TMP13]]) #[[ATTR3]]
+// CHECK4-NEXT: ret void
+//
+//
+// CHECK4-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_copy_func21
+// CHECK4-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] {
+// CHECK4-NEXT: entry:
+// CHECK4-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
+// CHECK4-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4
+// CHECK4-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
+// CHECK4-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
+// CHECK4-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4
+// CHECK4-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 4
+// CHECK4-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]*
+// CHECK4-NEXT: [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 4
+// CHECK4-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.4*
+// CHECK4-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4
+// CHECK4-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0
+// CHECK4-NEXT: [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 4
+// CHECK4-NEXT: [[TMP10:%.*]] = bitcast i8* [[TMP9]] to i32*
+// CHECK4-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4:%.*]], %struct._globalized_locals_ty.4* [[TMP6]], i32 0, i32 0
+// CHECK4-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* [[A]], i32 0, i32 [[TMP7]]
+// CHECK4-NEXT: [[TMP12:%.*]] = load i32, i32* [[TMP11]], align 128
+// CHECK4-NEXT: store i32 [[TMP12]], i32* [[TMP10]], align 4
+// CHECK4-NEXT: [[TMP13:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1
+// CHECK4-NEXT: [[TMP14:%.*]] = load i8*, i8** [[TMP13]], align 4
+// CHECK4-NEXT: [[TMP15:%.*]] = bitcast i8* [[TMP14]] to i16*
+// CHECK4-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4]], %struct._globalized_locals_ty.4* [[TMP6]], i32 0, i32 1
+// CHECK4-NEXT: [[TMP16:%.*]] = getelementptr inbounds [2048 x i16], [2048 x i16]* [[B]], i32 0, i32 [[TMP7]]
+// CHECK4-NEXT: [[TMP17:%.*]] = load i16, i16* [[TMP16]], align 128
+// CHECK4-NEXT: store i16 [[TMP17]], i16* [[TMP15]], align 2
+// CHECK4-NEXT: ret void
+//
+//
+// CHECK4-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_reduce_func22
+// CHECK4-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] {
+// CHECK4-NEXT: entry:
+// CHECK4-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
+// CHECK4-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4
+// CHECK4-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 4
+// CHECK4-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
+// CHECK4-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
+// CHECK4-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4
+// CHECK4-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4
+// CHECK4-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.4*
+// CHECK4-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4
+// CHECK4-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0
+// CHECK4-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4:%.*]], %struct._globalized_locals_ty.4* [[TMP4]], i32 0, i32 0
+// CHECK4-NEXT: [[TMP7:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* [[A]], i32 0, i32 [[TMP5]]
+// CHECK4-NEXT: [[TMP8:%.*]] = bitcast i32* [[TMP7]] to i8*
+// CHECK4-NEXT: store i8* [[TMP8]], i8** [[TMP6]], align 4
+// CHECK4-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1
+// CHECK4-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4]], %struct._globalized_locals_ty.4* [[TMP4]], i32 0, i32 1
+// CHECK4-NEXT: [[TMP10:%.*]] = getelementptr inbounds [2048 x i16], [2048 x i16]* [[B]], i32 0, i32 [[TMP5]]
+// CHECK4-NEXT: [[TMP11:%.*]] = bitcast i16* [[TMP10]] to i8*
+// CHECK4-NEXT: store i8* [[TMP11]], i8** [[TMP9]], align 4
+// CHECK4-NEXT: [[TMP12:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
+// CHECK4-NEXT: [[TMP13:%.*]] = load i8*, i8** [[DOTADDR2]], align 4
+// CHECK4-NEXT: call void @"_omp$reduction$reduction_func16"(i8* [[TMP13]], i8* [[TMP12]]) #[[ATTR3]]
+// CHECK4-NEXT: ret void
+//
+//
+// CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l23_worker
+// CHECK5-SAME: () #[[ATTR0:[0-9]+]] {
+// CHECK5-NEXT: entry:
+// CHECK5-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 4
+// CHECK5-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1
+// CHECK5-NEXT: store i8* null, i8** [[WORK_FN]], align 4
+// CHECK5-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1
+// CHECK5-NEXT: br label [[DOTAWAIT_WORK:%.*]]
+// CHECK5: .await.work:
+// CHECK5-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
+// CHECK5-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]])
+// CHECK5-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8
+// CHECK5-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1
+// CHECK5-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 4
+// CHECK5-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null
+// CHECK5-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]]
+// CHECK5: .select.workers:
+// CHECK5-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1
+// CHECK5-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0
+// CHECK5-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]]
+// CHECK5: .execute.parallel:
+// CHECK5-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]])
+// CHECK5-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)*
+// CHECK5-NEXT: call void [[TMP5]](i16 0, i32 [[TMP4]])
+// CHECK5-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]]
+// CHECK5: .terminate.parallel:
+// CHECK5-NEXT: call void @__kmpc_kernel_end_parallel()
+// CHECK5-NEXT: br label [[DOTBARRIER_PARALLEL]]
+// CHECK5: .barrier.parallel:
+// CHECK5-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
+// CHECK5-NEXT: br label [[DOTAWAIT_WORK]]
+// CHECK5: .exit:
+// CHECK5-NEXT: ret void
+//
+//
+// CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l23
+// CHECK5-SAME: (double* nonnull align 8 dereferenceable(8) [[E:%.*]]) #[[ATTR1:[0-9]+]] {
+// CHECK5-NEXT: entry:
+// CHECK5-NEXT: [[E_ADDR:%.*]] = alloca double*, align 4
+// CHECK5-NEXT: [[E7:%.*]] = alloca double, align 8
+// CHECK5-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK5-NEXT: store double* [[E]], double** [[E_ADDR]], align 4
+// CHECK5-NEXT: [[TMP0:%.*]] = load double*, double** [[E_ADDR]], align 4
+// CHECK5-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+// CHECK5-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK5-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+// CHECK5-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]]
+// CHECK5-NEXT: [[TMP1:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]]
+// CHECK5-NEXT: br i1 [[TMP1]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]]
+// CHECK5: .worker:
+// CHECK5-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l23_worker() #[[ATTR3:[0-9]+]]
+// CHECK5-NEXT: br label [[DOTEXIT:%.*]]
+// CHECK5: .mastercheck:
+// CHECK5-NEXT: [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+// CHECK5-NEXT: [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK5-NEXT: [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+// CHECK5-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1
+// CHECK5-NEXT: [[TMP3:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1
+// CHECK5-NEXT: [[TMP4:%.*]] = xor i32 [[TMP2]], -1
+// CHECK5-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP3]], [[TMP4]]
+// CHECK5-NEXT: [[TMP5:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]]
+// CHECK5-NEXT: br i1 [[TMP5]], label [[DOTMASTER:%.*]], label [[DOTEXIT]]
+// CHECK5: .master:
+// CHECK5-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK5-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+// CHECK5-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]]
+// CHECK5-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1)
+// CHECK5-NEXT: call void @__kmpc_data_sharing_init_stack()
+// CHECK5-NEXT: [[TMP6:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK5-NEXT: [[TMP7:%.*]] = load double, double* [[TMP0]], align 8
+// CHECK5-NEXT: store double [[TMP7]], double* [[E7]], align 8
+// CHECK5-NEXT: store i32 [[TMP6]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK5-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], double* [[E7]]) #[[ATTR3]]
+// CHECK5-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]]
+// CHECK5: .termination.notifier:
+// CHECK5-NEXT: call void @__kmpc_kernel_deinit(i16 1)
+// CHECK5-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
+// CHECK5-NEXT: br label [[DOTEXIT]]
+// CHECK5: .exit:
+// CHECK5-NEXT: ret void
+//
+//
+// CHECK5-LABEL: define {{[^@]+}}@__omp_outlined__
+// CHECK5-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], double* nonnull align 8 dereferenceable(8) [[E:%.*]]) #[[ATTR1]] {
+// CHECK5-NEXT: entry:
+// CHECK5-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK5-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK5-NEXT: [[E_ADDR:%.*]] = alloca double*, align 4
+// CHECK5-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x i8*], align 4
+// CHECK5-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK5-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK5-NEXT: store double* [[E]], double** [[E_ADDR]], align 4
+// CHECK5-NEXT: [[TMP0:%.*]] = load double*, double** [[E_ADDR]], align 4
+// CHECK5-NEXT: [[TMP1:%.*]] = call i8* @__kmpc_data_sharing_push_stack(i32 8, i16 1)
+// CHECK5-NEXT: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to %struct._globalized_locals_ty*
+// CHECK5-NEXT: [[E1:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP2]], i32 0, i32 0
+// CHECK5-NEXT: store double 0.000000e+00, double* [[E1]], align 8
+// CHECK5-NEXT: [[TMP3:%.*]] = load double, double* [[E1]], align 8
+// CHECK5-NEXT: [[ADD:%.*]] = fadd double [[TMP3]], 5.000000e+00
+// CHECK5-NEXT: store double [[ADD]], double* [[E1]], align 8
+// CHECK5-NEXT: [[TMP4:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK5-NEXT: [[TMP5:%.*]] = load i32, i32* [[TMP4]], align 4
+// CHECK5-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0
+// CHECK5-NEXT: [[TMP7:%.*]] = bitcast double* [[E1]] to i8*
+// CHECK5-NEXT: store i8* [[TMP7]], i8** [[TMP6]], align 4
+// CHECK5-NEXT: [[TMP8:%.*]] = bitcast [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
+// CHECK5-NEXT: [[TMP9:%.*]] = load i8*, i8** @"_openmp_teams_reductions_buffer_$_$ptr", align 4
+// CHECK5-NEXT: [[TMP10:%.*]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i8* [[TMP9]], i32 1024, i8* [[TMP8]], void (i8*, i16, i16, i16)* @_omp_reduction_shuffle_and_reduce_func, void (i8*, i32)* @_omp_reduction_inter_warp_copy_func, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_copy_func, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_reduce_func, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_copy_func, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_reduce_func)
+// CHECK5-NEXT: [[TMP11:%.*]] = icmp eq i32 [[TMP10]], 1
+// CHECK5-NEXT: br i1 [[TMP11]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]]
+// CHECK5: .omp.reduction.then:
+// CHECK5-NEXT: [[TMP12:%.*]] = load double, double* [[TMP0]], align 8
+// CHECK5-NEXT: [[TMP13:%.*]] = load double, double* [[E1]], align 8
+// CHECK5-NEXT: [[ADD2:%.*]] = fadd double [[TMP12]], [[TMP13]]
+// CHECK5-NEXT: store double [[ADD2]], double* [[TMP0]], align 8
+// CHECK5-NEXT: call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP5]])
+// CHECK5-NEXT: br label [[DOTOMP_REDUCTION_DONE]]
+// CHECK5: .omp.reduction.done:
+// CHECK5-NEXT: call void @__kmpc_data_sharing_pop_stack(i8* [[TMP1]])
+// CHECK5-NEXT: ret void
+//
+//
+// CHECK5-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func
+// CHECK5-SAME: (i8* [[TMP0:%.*]], i16 signext [[TMP1:%.*]], i16 signext [[TMP2:%.*]], i16 signext [[TMP3:%.*]]) #[[ATTR0]] {
+// CHECK5-NEXT: entry:
+// CHECK5-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
+// CHECK5-NEXT: [[DOTADDR1:%.*]] = alloca i16, align 2
+// CHECK5-NEXT: [[DOTADDR2:%.*]] = alloca i16, align 2
+// CHECK5-NEXT: [[DOTADDR3:%.*]] = alloca i16, align 2
+// CHECK5-NEXT: [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST:%.*]] = alloca [1 x i8*], align 4
+// CHECK5-NEXT: [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca double, align 8
+// CHECK5-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
+// CHECK5-NEXT: store i16 [[TMP1]], i16* [[DOTADDR1]], align 2
+// CHECK5-NEXT: store i16 [[TMP2]], i16* [[DOTADDR2]], align 2
+// CHECK5-NEXT: store i16 [[TMP3]], i16* [[DOTADDR3]], align 2
+// CHECK5-NEXT: [[TMP4:%.*]] = load i8*, i8** [[DOTADDR]], align 4
+// CHECK5-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to [1 x i8*]*
+// CHECK5-NEXT: [[TMP6:%.*]] = load i16, i16* [[DOTADDR1]], align 2
+// CHECK5-NEXT: [[TMP7:%.*]] = load i16, i16* [[DOTADDR2]], align 2
+// CHECK5-NEXT: [[TMP8:%.*]] = load i16, i16* [[DOTADDR3]], align 2
+// CHECK5-NEXT: [[TMP9:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP5]], i32 0, i32 0
+// CHECK5-NEXT: [[TMP10:%.*]] = load i8*, i8** [[TMP9]], align 4
+// CHECK5-NEXT: [[TMP11:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0
+// CHECK5-NEXT: [[TMP12:%.*]] = bitcast i8* [[TMP10]] to double*
+// CHECK5-NEXT: [[TMP13:%.*]] = getelementptr double, double* [[TMP12]], i32 1
+// CHECK5-NEXT: [[TMP14:%.*]] = bitcast double* [[TMP13]] to i8*
+// CHECK5-NEXT: [[TMP15:%.*]] = bitcast double* [[TMP12]] to i64*
+// CHECK5-NEXT: [[TMP16:%.*]] = bitcast double* [[DOTOMP_REDUCTION_ELEMENT]] to i64*
+// CHECK5-NEXT: [[TMP17:%.*]] = load i64, i64* [[TMP15]], align 8
+// CHECK5-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+// CHECK5-NEXT: [[TMP18:%.*]] = trunc i32 [[NVPTX_WARP_SIZE]] to i16
+// CHECK5-NEXT: [[TMP19:%.*]] = call i64 @__kmpc_shuffle_int64(i64 [[TMP17]], i16 [[TMP7]], i16 [[TMP18]])
+// CHECK5-NEXT: store i64 [[TMP19]], i64* [[TMP16]], align 8
+// CHECK5-NEXT: [[TMP20:%.*]] = getelementptr i64, i64* [[TMP15]], i32 1
+// CHECK5-NEXT: [[TMP21:%.*]] = getelementptr i64, i64* [[TMP16]], i32 1
+// CHECK5-NEXT: [[TMP22:%.*]] = bitcast double* [[DOTOMP_REDUCTION_ELEMENT]] to i8*
+// CHECK5-NEXT: store i8* [[TMP22]], i8** [[TMP11]], align 4
+// CHECK5-NEXT: [[TMP23:%.*]] = icmp eq i16 [[TMP8]], 0
+// CHECK5-NEXT: [[TMP24:%.*]] = icmp eq i16 [[TMP8]], 1
+// CHECK5-NEXT: [[TMP25:%.*]] = icmp ult i16 [[TMP6]], [[TMP7]]
+// CHECK5-NEXT: [[TMP26:%.*]] = and i1 [[TMP24]], [[TMP25]]
+// CHECK5-NEXT: [[TMP27:%.*]] = icmp eq i16 [[TMP8]], 2
+// CHECK5-NEXT: [[TMP28:%.*]] = and i16 [[TMP6]], 1
+// CHECK5-NEXT: [[TMP29:%.*]] = icmp eq i16 [[TMP28]], 0
+// CHECK5-NEXT: [[TMP30:%.*]] = and i1 [[TMP27]], [[TMP29]]
+// CHECK5-NEXT: [[TMP31:%.*]] = icmp sgt i16 [[TMP7]], 0
+// CHECK5-NEXT: [[TMP32:%.*]] = and i1 [[TMP30]], [[TMP31]]
+// CHECK5-NEXT: [[TMP33:%.*]] = or i1 [[TMP23]], [[TMP26]]
+// CHECK5-NEXT: [[TMP34:%.*]] = or i1 [[TMP33]], [[TMP32]]
+// CHECK5-NEXT: br i1 [[TMP34]], label [[THEN:%.*]], label [[ELSE:%.*]]
+// CHECK5: then:
+// CHECK5-NEXT: [[TMP35:%.*]] = bitcast [1 x i8*]* [[TMP5]] to i8*
+// CHECK5-NEXT: [[TMP36:%.*]] = bitcast [1 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to i8*
+// CHECK5-NEXT: call void @"_omp$reduction$reduction_func"(i8* [[TMP35]], i8* [[TMP36]]) #[[ATTR3]]
+// CHECK5-NEXT: br label [[IFCONT:%.*]]
+// CHECK5: else:
+// CHECK5-NEXT: br label [[IFCONT]]
+// CHECK5: ifcont:
+// CHECK5-NEXT: [[TMP37:%.*]] = icmp eq i16 [[TMP8]], 1
+// CHECK5-NEXT: [[TMP38:%.*]] = icmp uge i16 [[TMP6]], [[TMP7]]
+// CHECK5-NEXT: [[TMP39:%.*]] = and i1 [[TMP37]], [[TMP38]]
+// CHECK5-NEXT: br i1 [[TMP39]], label [[THEN4:%.*]], label [[ELSE5:%.*]]
+// CHECK5: then4:
+// CHECK5-NEXT: [[TMP40:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0
+// CHECK5-NEXT: [[TMP41:%.*]] = load i8*, i8** [[TMP40]], align 4
+// CHECK5-NEXT: [[TMP42:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP5]], i32 0, i32 0
+// CHECK5-NEXT: [[TMP43:%.*]] = load i8*, i8** [[TMP42]], align 4
+// CHECK5-NEXT: [[TMP44:%.*]] = bitcast i8* [[TMP41]] to double*
+// CHECK5-NEXT: [[TMP45:%.*]] = bitcast i8* [[TMP43]] to double*
+// CHECK5-NEXT: [[TMP46:%.*]] = load double, double* [[TMP44]], align 8
+// CHECK5-NEXT: store double [[TMP46]], double* [[TMP45]], align 8
+// CHECK5-NEXT: br label [[IFCONT6:%.*]]
+// CHECK5: else5:
+// CHECK5-NEXT: br label [[IFCONT6]]
+// CHECK5: ifcont6:
+// CHECK5-NEXT: ret void
+//
+//
+// CHECK5-LABEL: define {{[^@]+}}@_omp_reduction_inter_warp_copy_func
+// CHECK5-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] {
+// CHECK5-NEXT: entry:
+// CHECK5-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
+// CHECK5-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[DOTCNT_ADDR:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK5-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
+// CHECK5-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
+// CHECK5-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+// CHECK5-NEXT: [[NVPTX_TID2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+// CHECK5-NEXT: [[NVPTX_LANE_ID:%.*]] = and i32 [[NVPTX_TID2]], 31
+// CHECK5-NEXT: [[NVPTX_TID3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+// CHECK5-NEXT: [[NVPTX_WARP_ID:%.*]] = ashr i32 [[NVPTX_TID3]], 5
+// CHECK5-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4
+// CHECK5-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [1 x i8*]*
+// CHECK5-NEXT: store i32 0, i32* [[DOTCNT_ADDR]], align 4
+// CHECK5-NEXT: br label [[PRECOND:%.*]]
+// CHECK5: precond:
+// CHECK5-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTCNT_ADDR]], align 4
+// CHECK5-NEXT: [[TMP6:%.*]] = icmp ult i32 [[TMP5]], 2
+// CHECK5-NEXT: br i1 [[TMP6]], label [[BODY:%.*]], label [[EXIT:%.*]]
+// CHECK5: body:
+// CHECK5-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP2]])
+// CHECK5-NEXT: [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0
+// CHECK5-NEXT: br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]]
+// CHECK5: then:
+// CHECK5-NEXT: [[TMP7:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP4]], i32 0, i32 0
+// CHECK5-NEXT: [[TMP8:%.*]] = load i8*, i8** [[TMP7]], align 4
+// CHECK5-NEXT: [[TMP9:%.*]] = bitcast i8* [[TMP8]] to i32*
+// CHECK5-NEXT: [[TMP10:%.*]] = getelementptr i32, i32* [[TMP9]], i32 [[TMP5]]
+// CHECK5-NEXT: [[TMP11:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]]
+// CHECK5-NEXT: [[TMP12:%.*]] = load i32, i32* [[TMP10]], align 4
+// CHECK5-NEXT: store volatile i32 [[TMP12]], i32 addrspace(3)* [[TMP11]], align 4
+// CHECK5-NEXT: br label [[IFCONT:%.*]]
+// CHECK5: else:
+// CHECK5-NEXT: br label [[IFCONT]]
+// CHECK5: ifcont:
+// CHECK5-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]])
+// CHECK5-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTADDR1]], align 4
+// CHECK5-NEXT: [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP13]]
+// CHECK5-NEXT: br i1 [[IS_ACTIVE_THREAD]], label [[THEN4:%.*]], label [[ELSE5:%.*]]
+// CHECK5: then4:
+// CHECK5-NEXT: [[TMP14:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]]
+// CHECK5-NEXT: [[TMP15:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP4]], i32 0, i32 0
+// CHECK5-NEXT: [[TMP16:%.*]] = load i8*, i8** [[TMP15]], align 4
+// CHECK5-NEXT: [[TMP17:%.*]] = bitcast i8* [[TMP16]] to i32*
+// CHECK5-NEXT: [[TMP18:%.*]] = getelementptr i32, i32* [[TMP17]], i32 [[TMP5]]
+// CHECK5-NEXT: [[TMP19:%.*]] = load volatile i32, i32 addrspace(3)* [[TMP14]], align 4
+// CHECK5-NEXT: store i32 [[TMP19]], i32* [[TMP18]], align 4
+// CHECK5-NEXT: br label [[IFCONT6:%.*]]
+// CHECK5: else5:
+// CHECK5-NEXT: br label [[IFCONT6]]
+// CHECK5: ifcont6:
+// CHECK5-NEXT: [[TMP20:%.*]] = add nsw i32 [[TMP5]], 1
+// CHECK5-NEXT: store i32 [[TMP20]], i32* [[DOTCNT_ADDR]], align 4
+// CHECK5-NEXT: br label [[PRECOND]]
+// CHECK5: exit:
+// CHECK5-NEXT: ret void
+//
+//
+// CHECK5-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_copy_func
+// CHECK5-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] {
+// CHECK5-NEXT: entry:
+// CHECK5-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
+// CHECK5-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4
+// CHECK5-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
+// CHECK5-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
+// CHECK5-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4
+// CHECK5-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 4
+// CHECK5-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [1 x i8*]*
+// CHECK5-NEXT: [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 4
+// CHECK5-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.0*
+// CHECK5-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4
+// CHECK5-NEXT: [[TMP8:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP4]], i32 0, i32 0
+// CHECK5-NEXT: [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 4
+// CHECK5-NEXT: [[TMP10:%.*]] = bitcast i8* [[TMP9]] to double*
+// CHECK5-NEXT: [[E:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0:%.*]], %struct._globalized_locals_ty.0* [[TMP6]], i32 0, i32 0
+// CHECK5-NEXT: [[TMP11:%.*]] = getelementptr inbounds [1024 x double], [1024 x double]* [[E]], i32 0, i32 [[TMP7]]
+// CHECK5-NEXT: [[TMP12:%.*]] = load double, double* [[TMP10]], align 8
+// CHECK5-NEXT: store double [[TMP12]], double* [[TMP11]], align 128
+// CHECK5-NEXT: ret void
+//
+//
+// CHECK5-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_reduce_func
+// CHECK5-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] {
+// CHECK5-NEXT: entry:
+// CHECK5-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
+// CHECK5-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4
+// CHECK5-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x i8*], align 4
+// CHECK5-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
+// CHECK5-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
+// CHECK5-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4
+// CHECK5-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4
+// CHECK5-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.0*
+// CHECK5-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4
+// CHECK5-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0
+// CHECK5-NEXT: [[E:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0:%.*]], %struct._globalized_locals_ty.0* [[TMP4]], i32 0, i32 0
+// CHECK5-NEXT: [[TMP7:%.*]] = getelementptr inbounds [1024 x double], [1024 x double]* [[E]], i32 0, i32 [[TMP5]]
+// CHECK5-NEXT: [[TMP8:%.*]] = bitcast double* [[TMP7]] to i8*
+// CHECK5-NEXT: store i8* [[TMP8]], i8** [[TMP6]], align 4
+// CHECK5-NEXT: [[TMP9:%.*]] = bitcast [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
+// CHECK5-NEXT: [[TMP10:%.*]] = load i8*, i8** [[DOTADDR2]], align 4
+// CHECK5-NEXT: call void @"_omp$reduction$reduction_func"(i8* [[TMP9]], i8* [[TMP10]]) #[[ATTR3]]
+// CHECK5-NEXT: ret void
+//
+//
+// CHECK5-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_copy_func
+// CHECK5-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] {
+// CHECK5-NEXT: entry:
+// CHECK5-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
+// CHECK5-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4
+// CHECK5-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
+// CHECK5-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
+// CHECK5-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4
+// CHECK5-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 4
+// CHECK5-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [1 x i8*]*
+// CHECK5-NEXT: [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 4
+// CHECK5-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.0*
+// CHECK5-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4
+// CHECK5-NEXT: [[TMP8:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP4]], i32 0, i32 0
+// CHECK5-NEXT: [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 4
+// CHECK5-NEXT: [[TMP10:%.*]] = bitcast i8* [[TMP9]] to double*
+// CHECK5-NEXT: [[E:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0:%.*]], %struct._globalized_locals_ty.0* [[TMP6]], i32 0, i32 0
+// CHECK5-NEXT: [[TMP11:%.*]] = getelementptr inbounds [1024 x double], [1024 x double]* [[E]], i32 0, i32 [[TMP7]]
+// CHECK5-NEXT: [[TMP12:%.*]] = load double, double* [[TMP11]], align 128
+// CHECK5-NEXT: store double [[TMP12]], double* [[TMP10]], align 8
+// CHECK5-NEXT: ret void
+//
+//
+// CHECK5-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_reduce_func
+// CHECK5-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] {
+// CHECK5-NEXT: entry:
+// CHECK5-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
+// CHECK5-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4
+// CHECK5-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x i8*], align 4
+// CHECK5-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
+// CHECK5-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
+// CHECK5-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4
+// CHECK5-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4
+// CHECK5-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.0*
+// CHECK5-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4
+// CHECK5-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0
+// CHECK5-NEXT: [[E:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0:%.*]], %struct._globalized_locals_ty.0* [[TMP4]], i32 0, i32 0
+// CHECK5-NEXT: [[TMP7:%.*]] = getelementptr inbounds [1024 x double], [1024 x double]* [[E]], i32 0, i32 [[TMP5]]
+// CHECK5-NEXT: [[TMP8:%.*]] = bitcast double* [[TMP7]] to i8*
+// CHECK5-NEXT: store i8* [[TMP8]], i8** [[TMP6]], align 4
+// CHECK5-NEXT: [[TMP9:%.*]] = bitcast [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
+// CHECK5-NEXT: [[TMP10:%.*]] = load i8*, i8** [[DOTADDR2]], align 4
+// CHECK5-NEXT: call void @"_omp$reduction$reduction_func"(i8* [[TMP10]], i8* [[TMP9]]) #[[ATTR3]]
+// CHECK5-NEXT: ret void
+//
+//
+// CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l29_worker
+// CHECK5-SAME: () #[[ATTR0]] {
+// CHECK5-NEXT: entry:
+// CHECK5-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 4
+// CHECK5-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1
+// CHECK5-NEXT: store i8* null, i8** [[WORK_FN]], align 4
+// CHECK5-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1
+// CHECK5-NEXT: br label [[DOTAWAIT_WORK:%.*]]
+// CHECK5: .await.work:
+// CHECK5-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
+// CHECK5-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]])
+// CHECK5-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8
+// CHECK5-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1
+// CHECK5-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 4
+// CHECK5-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null
+// CHECK5-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]]
+// CHECK5: .select.workers:
+// CHECK5-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1
+// CHECK5-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0
+// CHECK5-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]]
+// CHECK5: .execute.parallel:
+// CHECK5-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK5-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)*
+// CHECK5-NEXT: call void [[TMP5]](i16 0, i32 [[TMP4]])
+// CHECK5-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]]
+// CHECK5: .terminate.parallel:
+// CHECK5-NEXT: call void @__kmpc_kernel_end_parallel()
+// CHECK5-NEXT: br label [[DOTBARRIER_PARALLEL]]
+// CHECK5: .barrier.parallel:
+// CHECK5-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
+// CHECK5-NEXT: br label [[DOTAWAIT_WORK]]
+// CHECK5: .exit:
+// CHECK5-NEXT: ret void
+//
+//
+// CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l29
+// CHECK5-SAME: (i32 [[C:%.*]], i32 [[D:%.*]]) #[[ATTR1]] {
+// CHECK5-NEXT: entry:
+// CHECK5-NEXT: [[C_ADDR:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[D_ADDR:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK5-NEXT: store i32 [[C]], i32* [[C_ADDR]], align 4
+// CHECK5-NEXT: store i32 [[D]], i32* [[D_ADDR]], align 4
+// CHECK5-NEXT: [[CONV:%.*]] = bitcast i32* [[C_ADDR]] to i8*
+// CHECK5-NEXT: [[CONV1:%.*]] = bitcast i32* [[D_ADDR]] to float*
+// CHECK5-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+// CHECK5-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK5-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+// CHECK5-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]]
+// CHECK5-NEXT: [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]]
+// CHECK5-NEXT: br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]]
+// CHECK5: .worker:
+// CHECK5-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l29_worker() #[[ATTR3]]
+// CHECK5-NEXT: br label [[DOTEXIT:%.*]]
+// CHECK5: .mastercheck:
+// CHECK5-NEXT: [[NVPTX_TID2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+// CHECK5-NEXT: [[NVPTX_NUM_THREADS3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK5-NEXT: [[NVPTX_WARP_SIZE4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+// CHECK5-NEXT: [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE4]], 1
+// CHECK5-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS3]], 1
+// CHECK5-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], -1
+// CHECK5-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP2]], [[TMP3]]
+// CHECK5-NEXT: [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID2]], [[MASTER_TID]]
+// CHECK5-NEXT: br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]]
+// CHECK5: .master:
+// CHECK5-NEXT: [[NVPTX_NUM_THREADS5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK5-NEXT: [[NVPTX_WARP_SIZE6:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+// CHECK5-NEXT: [[THREAD_LIMIT7:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS5]], [[NVPTX_WARP_SIZE6]]
+// CHECK5-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT7]], i16 1)
+// CHECK5-NEXT: call void @__kmpc_data_sharing_init_stack()
+// CHECK5-NEXT: [[TMP5:%.*]] = call i8* @__kmpc_data_sharing_push_stack(i32 8, i16 1)
+// CHECK5-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.1*
+// CHECK5-NEXT: [[TMP7:%.*]] = load i8, i8* [[CONV]], align 4
+// CHECK5-NEXT: [[C8:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1:%.*]], %struct._globalized_locals_ty.1* [[TMP6]], i32 0, i32 1
+// CHECK5-NEXT: store i8 [[TMP7]], i8* [[C8]], align 4
+// CHECK5-NEXT: [[TMP8:%.*]] = load float, float* [[CONV1]], align 4
+// CHECK5-NEXT: [[D9:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1]], %struct._globalized_locals_ty.1* [[TMP6]], i32 0, i32 0
+// CHECK5-NEXT: store float [[TMP8]], float* [[D9]], align 4
+// CHECK5-NEXT: [[TMP9:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK5-NEXT: store i32 [[TMP9]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK5-NEXT: call void @__omp_outlined__1(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i8* [[C8]], float* [[D9]]) #[[ATTR3]]
+// CHECK5-NEXT: call void @__kmpc_data_sharing_pop_stack(i8* [[TMP5]])
+// CHECK5-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]]
+// CHECK5: .termination.notifier:
+// CHECK5-NEXT: call void @__kmpc_kernel_deinit(i16 1)
+// CHECK5-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
+// CHECK5-NEXT: br label [[DOTEXIT]]
+// CHECK5: .exit:
+// CHECK5-NEXT: ret void
+//
+//
+// CHECK5-LABEL: define {{[^@]+}}@__omp_outlined__1
+// CHECK5-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i8* nonnull align 1 dereferenceable(1) [[C:%.*]], float* nonnull align 4 dereferenceable(4) [[D:%.*]]) #[[ATTR1]] {
+// CHECK5-NEXT: entry:
+// CHECK5-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK5-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK5-NEXT: [[C_ADDR:%.*]] = alloca i8*, align 4
+// CHECK5-NEXT: [[D_ADDR:%.*]] = alloca float*, align 4
+// CHECK5-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 4
+// CHECK5-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK5-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK5-NEXT: store i8* [[C]], i8** [[C_ADDR]], align 4
+// CHECK5-NEXT: store float* [[D]], float** [[D_ADDR]], align 4
+// CHECK5-NEXT: [[TMP0:%.*]] = load i8*, i8** [[C_ADDR]], align 4
+// CHECK5-NEXT: [[TMP1:%.*]] = load float*, float** [[D_ADDR]], align 4
+// CHECK5-NEXT: [[TMP2:%.*]] = call i8* @__kmpc_data_sharing_push_stack(i32 8, i16 1)
+// CHECK5-NEXT: [[TMP3:%.*]] = bitcast i8* [[TMP2]] to %struct._globalized_locals_ty.2*
+// CHECK5-NEXT: [[C1:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_2:%.*]], %struct._globalized_locals_ty.2* [[TMP3]], i32 0, i32 1
+// CHECK5-NEXT: [[D2:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_2]], %struct._globalized_locals_ty.2* [[TMP3]], i32 0, i32 0
+// CHECK5-NEXT: store i8 0, i8* [[C1]], align 4
+// CHECK5-NEXT: store float 1.000000e+00, float* [[D2]], align 4
+// CHECK5-NEXT: [[TMP4:%.*]] = load i8, i8* [[C1]], align 4
+// CHECK5-NEXT: [[CONV:%.*]] = sext i8 [[TMP4]] to i32
+// CHECK5-NEXT: [[XOR:%.*]] = xor i32 [[CONV]], 2
+// CHECK5-NEXT: [[CONV3:%.*]] = trunc i32 [[XOR]] to i8
+// CHECK5-NEXT: store i8 [[CONV3]], i8* [[C1]], align 4
+// CHECK5-NEXT: [[TMP5:%.*]] = load float, float* [[D2]], align 4
+// CHECK5-NEXT: [[MUL:%.*]] = fmul float [[TMP5]], 3.300000e+01
+// CHECK5-NEXT: store float [[MUL]], float* [[D2]], align 4
+// CHECK5-NEXT: [[TMP6:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK5-NEXT: [[TMP7:%.*]] = load i32, i32* [[TMP6]], align 4
+// CHECK5-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0
+// CHECK5-NEXT: store i8* [[C1]], i8** [[TMP8]], align 4
+// CHECK5-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1
+// CHECK5-NEXT: [[TMP10:%.*]] = bitcast float* [[D2]] to i8*
+// CHECK5-NEXT: store i8* [[TMP10]], i8** [[TMP9]], align 4
+// CHECK5-NEXT: [[TMP11:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
+// CHECK5-NEXT: [[TMP12:%.*]] = load i8*, i8** @"_openmp_teams_reductions_buffer_$_$ptr", align 4
+// CHECK5-NEXT: [[TMP13:%.*]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(%struct.ident_t* @[[GLOB1]], i32 [[TMP7]], i8* [[TMP12]], i32 1024, i8* [[TMP11]], void (i8*, i16, i16, i16)* @_omp_reduction_shuffle_and_reduce_func3, void (i8*, i32)* @_omp_reduction_inter_warp_copy_func4, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_copy_func5, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_reduce_func6, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_copy_func7, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_reduce_func8)
+// CHECK5-NEXT: [[TMP14:%.*]] = icmp eq i32 [[TMP13]], 1
+// CHECK5-NEXT: br i1 [[TMP14]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]]
+// CHECK5: .omp.reduction.then:
+// CHECK5-NEXT: [[TMP15:%.*]] = load i8, i8* [[TMP0]], align 1
+// CHECK5-NEXT: [[CONV4:%.*]] = sext i8 [[TMP15]] to i32
+// CHECK5-NEXT: [[TMP16:%.*]] = load i8, i8* [[C1]], align 4
+// CHECK5-NEXT: [[CONV5:%.*]] = sext i8 [[TMP16]] to i32
+// CHECK5-NEXT: [[XOR6:%.*]] = xor i32 [[CONV4]], [[CONV5]] +// CHECK5-NEXT: [[CONV7:%.*]] = trunc i32 [[XOR6]] to i8 +// CHECK5-NEXT: store i8 [[CONV7]], i8* [[TMP0]], align 1 +// CHECK5-NEXT: [[TMP17:%.*]] = load float, float* [[TMP1]], align 4 +// CHECK5-NEXT: [[TMP18:%.*]] = load float, float* [[D2]], align 4 +// CHECK5-NEXT: [[MUL8:%.*]] = fmul float [[TMP17]], [[TMP18]] +// CHECK5-NEXT: store float [[MUL8]], float* [[TMP1]], align 4 +// CHECK5-NEXT: call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP7]]) +// CHECK5-NEXT: br label [[DOTOMP_REDUCTION_DONE]] +// CHECK5: .omp.reduction.done: +// CHECK5-NEXT: call void @__kmpc_data_sharing_pop_stack(i8* [[TMP2]]) +// CHECK5-NEXT: ret void +// +// +// CHECK5-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func3 +// CHECK5-SAME: (i8* [[TMP0:%.*]], i16 signext [[TMP1:%.*]], i16 signext [[TMP2:%.*]], i16 signext [[TMP3:%.*]]) #[[ATTR0]] { +// CHECK5-NEXT: entry: +// CHECK5-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 +// CHECK5-NEXT: [[DOTADDR1:%.*]] = alloca i16, align 2 +// CHECK5-NEXT: [[DOTADDR2:%.*]] = alloca i16, align 2 +// CHECK5-NEXT: [[DOTADDR3:%.*]] = alloca i16, align 2 +// CHECK5-NEXT: [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST:%.*]] = alloca [2 x i8*], align 4 +// CHECK5-NEXT: [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca i8, align 1 +// CHECK5-NEXT: [[DOTOMP_REDUCTION_ELEMENT4:%.*]] = alloca float, align 4 +// CHECK5-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 +// CHECK5-NEXT: store i16 [[TMP1]], i16* [[DOTADDR1]], align 2 +// CHECK5-NEXT: store i16 [[TMP2]], i16* [[DOTADDR2]], align 2 +// CHECK5-NEXT: store i16 [[TMP3]], i16* [[DOTADDR3]], align 2 +// CHECK5-NEXT: [[TMP4:%.*]] = load i8*, i8** [[DOTADDR]], align 4 +// CHECK5-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to [2 x i8*]* +// CHECK5-NEXT: [[TMP6:%.*]] = load i16, i16* [[DOTADDR1]], align 2 +// CHECK5-NEXT: [[TMP7:%.*]] = load i16, i16* [[DOTADDR2]], align 2 +// CHECK5-NEXT: [[TMP8:%.*]] = load i16, i16* [[DOTADDR3]], align 2 +// CHECK5-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 0 +// CHECK5-NEXT: [[TMP10:%.*]] = load i8*, i8** [[TMP9]], align 4 +// CHECK5-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0 +// CHECK5-NEXT: [[TMP12:%.*]] = getelementptr i8, i8* [[TMP10]], i32 1 +// CHECK5-NEXT: [[TMP13:%.*]] = load i8, i8* [[TMP10]], align 1 +// CHECK5-NEXT: [[TMP14:%.*]] = sext i8 [[TMP13]] to i32 +// CHECK5-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK5-NEXT: [[TMP15:%.*]] = trunc i32 [[NVPTX_WARP_SIZE]] to i16 +// CHECK5-NEXT: [[TMP16:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP14]], i16 [[TMP7]], i16 [[TMP15]]) +// CHECK5-NEXT: [[TMP17:%.*]] = trunc i32 [[TMP16]] to i8 +// CHECK5-NEXT: store i8 [[TMP17]], i8* [[DOTOMP_REDUCTION_ELEMENT]], align 1 +// CHECK5-NEXT: [[TMP18:%.*]] = getelementptr i8, i8* [[TMP10]], i32 1 +// CHECK5-NEXT: [[TMP19:%.*]] = getelementptr i8, i8* [[DOTOMP_REDUCTION_ELEMENT]], i32 1 +// CHECK5-NEXT: store i8* [[DOTOMP_REDUCTION_ELEMENT]], i8** [[TMP11]], align 4 +// CHECK5-NEXT: [[TMP20:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 1 +// CHECK5-NEXT: [[TMP21:%.*]] = load i8*, i8** [[TMP20]], align 4 +// CHECK5-NEXT: [[TMP22:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1 +// CHECK5-NEXT: [[TMP23:%.*]] = bitcast i8* [[TMP21]] to float* +// CHECK5-NEXT: [[TMP24:%.*]] = getelementptr float, 
float* [[TMP23]], i32 1 +// CHECK5-NEXT: [[TMP25:%.*]] = bitcast float* [[TMP24]] to i8* +// CHECK5-NEXT: [[TMP26:%.*]] = bitcast float* [[TMP23]] to i32* +// CHECK5-NEXT: [[TMP27:%.*]] = bitcast float* [[DOTOMP_REDUCTION_ELEMENT4]] to i32* +// CHECK5-NEXT: [[TMP28:%.*]] = load i32, i32* [[TMP26]], align 4 +// CHECK5-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK5-NEXT: [[TMP29:%.*]] = trunc i32 [[NVPTX_WARP_SIZE5]] to i16 +// CHECK5-NEXT: [[TMP30:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP28]], i16 [[TMP7]], i16 [[TMP29]]) +// CHECK5-NEXT: store i32 [[TMP30]], i32* [[TMP27]], align 4 +// CHECK5-NEXT: [[TMP31:%.*]] = getelementptr i32, i32* [[TMP26]], i32 1 +// CHECK5-NEXT: [[TMP32:%.*]] = getelementptr i32, i32* [[TMP27]], i32 1 +// CHECK5-NEXT: [[TMP33:%.*]] = bitcast float* [[DOTOMP_REDUCTION_ELEMENT4]] to i8* +// CHECK5-NEXT: store i8* [[TMP33]], i8** [[TMP22]], align 4 +// CHECK5-NEXT: [[TMP34:%.*]] = icmp eq i16 [[TMP8]], 0 +// CHECK5-NEXT: [[TMP35:%.*]] = icmp eq i16 [[TMP8]], 1 +// CHECK5-NEXT: [[TMP36:%.*]] = icmp ult i16 [[TMP6]], [[TMP7]] +// CHECK5-NEXT: [[TMP37:%.*]] = and i1 [[TMP35]], [[TMP36]] +// CHECK5-NEXT: [[TMP38:%.*]] = icmp eq i16 [[TMP8]], 2 +// CHECK5-NEXT: [[TMP39:%.*]] = and i16 [[TMP6]], 1 +// CHECK5-NEXT: [[TMP40:%.*]] = icmp eq i16 [[TMP39]], 0 +// CHECK5-NEXT: [[TMP41:%.*]] = and i1 [[TMP38]], [[TMP40]] +// CHECK5-NEXT: [[TMP42:%.*]] = icmp sgt i16 [[TMP7]], 0 +// CHECK5-NEXT: [[TMP43:%.*]] = and i1 [[TMP41]], [[TMP42]] +// CHECK5-NEXT: [[TMP44:%.*]] = or i1 [[TMP34]], [[TMP37]] +// CHECK5-NEXT: [[TMP45:%.*]] = or i1 [[TMP44]], [[TMP43]] +// CHECK5-NEXT: br i1 [[TMP45]], label [[THEN:%.*]], label [[ELSE:%.*]] +// CHECK5: then: +// CHECK5-NEXT: [[TMP46:%.*]] = bitcast [2 x i8*]* [[TMP5]] to i8* +// CHECK5-NEXT: [[TMP47:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to i8* +// CHECK5-NEXT: call void @"_omp$reduction$reduction_func2"(i8* [[TMP46]], i8* [[TMP47]]) #[[ATTR3]] +// CHECK5-NEXT: br label [[IFCONT:%.*]] +// CHECK5: else: +// CHECK5-NEXT: br label [[IFCONT]] +// CHECK5: ifcont: +// CHECK5-NEXT: [[TMP48:%.*]] = icmp eq i16 [[TMP8]], 1 +// CHECK5-NEXT: [[TMP49:%.*]] = icmp uge i16 [[TMP6]], [[TMP7]] +// CHECK5-NEXT: [[TMP50:%.*]] = and i1 [[TMP48]], [[TMP49]] +// CHECK5-NEXT: br i1 [[TMP50]], label [[THEN6:%.*]], label [[ELSE7:%.*]] +// CHECK5: then6: +// CHECK5-NEXT: [[TMP51:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0 +// CHECK5-NEXT: [[TMP52:%.*]] = load i8*, i8** [[TMP51]], align 4 +// CHECK5-NEXT: [[TMP53:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 0 +// CHECK5-NEXT: [[TMP54:%.*]] = load i8*, i8** [[TMP53]], align 4 +// CHECK5-NEXT: [[TMP55:%.*]] = load i8, i8* [[TMP52]], align 1 +// CHECK5-NEXT: store i8 [[TMP55]], i8* [[TMP54]], align 1 +// CHECK5-NEXT: [[TMP56:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1 +// CHECK5-NEXT: [[TMP57:%.*]] = load i8*, i8** [[TMP56]], align 4 +// CHECK5-NEXT: [[TMP58:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 1 +// CHECK5-NEXT: [[TMP59:%.*]] = load i8*, i8** [[TMP58]], align 4 +// CHECK5-NEXT: [[TMP60:%.*]] = bitcast i8* [[TMP57]] to float* +// CHECK5-NEXT: [[TMP61:%.*]] = bitcast i8* [[TMP59]] to float* +// CHECK5-NEXT: [[TMP62:%.*]] = load float, float* [[TMP60]], align 4 +// CHECK5-NEXT: store float [[TMP62]], float* [[TMP61]], align 4 +// CHECK5-NEXT: br label 
[[IFCONT8:%.*]] +// CHECK5: else7: +// CHECK5-NEXT: br label [[IFCONT8]] +// CHECK5: ifcont8: +// CHECK5-NEXT: ret void +// +// +// CHECK5-LABEL: define {{[^@]+}}@_omp_reduction_inter_warp_copy_func4 +// CHECK5-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { +// CHECK5-NEXT: entry: +// CHECK5-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 +// CHECK5-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) +// CHECK5-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 +// CHECK5-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK5-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK5-NEXT: [[NVPTX_TID2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK5-NEXT: [[NVPTX_LANE_ID:%.*]] = and i32 [[NVPTX_TID2]], 31 +// CHECK5-NEXT: [[NVPTX_TID3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK5-NEXT: [[NVPTX_WARP_ID:%.*]] = ashr i32 [[NVPTX_TID3]], 5 +// CHECK5-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4 +// CHECK5-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]* +// CHECK5-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]]) +// CHECK5-NEXT: [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0 +// CHECK5-NEXT: br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]] +// CHECK5: then: +// CHECK5-NEXT: [[TMP5:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0 +// CHECK5-NEXT: [[TMP6:%.*]] = load i8*, i8** [[TMP5]], align 4 +// CHECK5-NEXT: [[TMP7:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]] +// CHECK5-NEXT: [[TMP8:%.*]] = bitcast i32 addrspace(3)* [[TMP7]] to i8 addrspace(3)* +// CHECK5-NEXT: [[TMP9:%.*]] = load i8, i8* [[TMP6]], align 1 +// CHECK5-NEXT: store volatile i8 [[TMP9]], i8 addrspace(3)* [[TMP8]], align 1 +// CHECK5-NEXT: br label [[IFCONT:%.*]] +// CHECK5: else: +// CHECK5-NEXT: br label [[IFCONT]] +// CHECK5: ifcont: +// CHECK5-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]]) +// CHECK5-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTADDR1]], align 4 +// CHECK5-NEXT: [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP10]] +// CHECK5-NEXT: br i1 [[IS_ACTIVE_THREAD]], label [[THEN4:%.*]], label [[ELSE5:%.*]] +// CHECK5: then4: +// CHECK5-NEXT: [[TMP11:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]] +// CHECK5-NEXT: [[TMP12:%.*]] = bitcast i32 addrspace(3)* [[TMP11]] to i8 addrspace(3)* +// CHECK5-NEXT: [[TMP13:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0 +// CHECK5-NEXT: [[TMP14:%.*]] = load i8*, i8** [[TMP13]], align 4 +// CHECK5-NEXT: [[TMP15:%.*]] = load volatile i8, i8 addrspace(3)* [[TMP12]], align 1 +// CHECK5-NEXT: store i8 [[TMP15]], i8* [[TMP14]], align 1 +// CHECK5-NEXT: br label [[IFCONT6:%.*]] +// CHECK5: else5: +// CHECK5-NEXT: br label [[IFCONT6]] +// CHECK5: ifcont6: +// CHECK5-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]]) +// CHECK5-NEXT: [[WARP_MASTER7:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0 +// CHECK5-NEXT: br i1 [[WARP_MASTER7]], label [[THEN8:%.*]], label [[ELSE9:%.*]] +// CHECK5: then8: +// CHECK5-NEXT: [[TMP16:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1 +// CHECK5-NEXT: [[TMP17:%.*]] = load i8*, i8** [[TMP16]], align 4 +// CHECK5-NEXT: 
[[TMP18:%.*]] = bitcast i8* [[TMP17]] to i32* +// CHECK5-NEXT: [[TMP19:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]] +// CHECK5-NEXT: [[TMP20:%.*]] = load i32, i32* [[TMP18]], align 4 +// CHECK5-NEXT: store volatile i32 [[TMP20]], i32 addrspace(3)* [[TMP19]], align 4 +// CHECK5-NEXT: br label [[IFCONT10:%.*]] +// CHECK5: else9: +// CHECK5-NEXT: br label [[IFCONT10]] +// CHECK5: ifcont10: +// CHECK5-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]]) +// CHECK5-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTADDR1]], align 4 +// CHECK5-NEXT: [[IS_ACTIVE_THREAD11:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP21]] +// CHECK5-NEXT: br i1 [[IS_ACTIVE_THREAD11]], label [[THEN12:%.*]], label [[ELSE13:%.*]] +// CHECK5: then12: +// CHECK5-NEXT: [[TMP22:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]] +// CHECK5-NEXT: [[TMP23:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1 +// CHECK5-NEXT: [[TMP24:%.*]] = load i8*, i8** [[TMP23]], align 4 +// CHECK5-NEXT: [[TMP25:%.*]] = bitcast i8* [[TMP24]] to i32* +// CHECK5-NEXT: [[TMP26:%.*]] = load volatile i32, i32 addrspace(3)* [[TMP22]], align 4 +// CHECK5-NEXT: store i32 [[TMP26]], i32* [[TMP25]], align 4 +// CHECK5-NEXT: br label [[IFCONT14:%.*]] +// CHECK5: else13: +// CHECK5-NEXT: br label [[IFCONT14]] +// CHECK5: ifcont14: +// CHECK5-NEXT: ret void +// +// +// CHECK5-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_copy_func5 +// CHECK5-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { +// CHECK5-NEXT: entry: +// CHECK5-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 +// CHECK5-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4 +// CHECK5-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 +// CHECK5-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK5-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4 +// CHECK5-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 4 +// CHECK5-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]* +// CHECK5-NEXT: [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 4 +// CHECK5-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.3* +// CHECK5-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4 +// CHECK5-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0 +// CHECK5-NEXT: [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 4 +// CHECK5-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_3:%.*]], %struct._globalized_locals_ty.3* [[TMP6]], i32 0, i32 0 +// CHECK5-NEXT: [[TMP10:%.*]] = getelementptr inbounds [1024 x i8], [1024 x i8]* [[C]], i32 0, i32 [[TMP7]] +// CHECK5-NEXT: [[TMP11:%.*]] = load i8, i8* [[TMP9]], align 1 +// CHECK5-NEXT: store i8 [[TMP11]], i8* [[TMP10]], align 128 +// CHECK5-NEXT: [[TMP12:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1 +// CHECK5-NEXT: [[TMP13:%.*]] = load i8*, i8** [[TMP12]], align 4 +// CHECK5-NEXT: [[TMP14:%.*]] = bitcast i8* [[TMP13]] to float* +// CHECK5-NEXT: [[D:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_3]], %struct._globalized_locals_ty.3* [[TMP6]], i32 0, i32 1 +// CHECK5-NEXT: [[TMP15:%.*]] = getelementptr inbounds [1024 x float], [1024 x float]* [[D]], i32 0, i32 [[TMP7]] +// CHECK5-NEXT: [[TMP16:%.*]] = load float, float* 
[[TMP14]], align 4 +// CHECK5-NEXT: store float [[TMP16]], float* [[TMP15]], align 128 +// CHECK5-NEXT: ret void +// +// +// CHECK5-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_reduce_func6 +// CHECK5-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { +// CHECK5-NEXT: entry: +// CHECK5-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 +// CHECK5-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4 +// CHECK5-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 4 +// CHECK5-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 +// CHECK5-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK5-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4 +// CHECK5-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4 +// CHECK5-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.3* +// CHECK5-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4 +// CHECK5-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0 +// CHECK5-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_3:%.*]], %struct._globalized_locals_ty.3* [[TMP4]], i32 0, i32 0 +// CHECK5-NEXT: [[TMP7:%.*]] = getelementptr inbounds [1024 x i8], [1024 x i8]* [[C]], i32 0, i32 [[TMP5]] +// CHECK5-NEXT: store i8* [[TMP7]], i8** [[TMP6]], align 4 +// CHECK5-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1 +// CHECK5-NEXT: [[D:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_3]], %struct._globalized_locals_ty.3* [[TMP4]], i32 0, i32 1 +// CHECK5-NEXT: [[TMP9:%.*]] = getelementptr inbounds [1024 x float], [1024 x float]* [[D]], i32 0, i32 [[TMP5]] +// CHECK5-NEXT: [[TMP10:%.*]] = bitcast float* [[TMP9]] to i8* +// CHECK5-NEXT: store i8* [[TMP10]], i8** [[TMP8]], align 4 +// CHECK5-NEXT: [[TMP11:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8* +// CHECK5-NEXT: [[TMP12:%.*]] = load i8*, i8** [[DOTADDR2]], align 4 +// CHECK5-NEXT: call void @"_omp$reduction$reduction_func2"(i8* [[TMP11]], i8* [[TMP12]]) #[[ATTR3]] +// CHECK5-NEXT: ret void +// +// +// CHECK5-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_copy_func7 +// CHECK5-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { +// CHECK5-NEXT: entry: +// CHECK5-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 +// CHECK5-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4 +// CHECK5-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 +// CHECK5-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK5-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4 +// CHECK5-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 4 +// CHECK5-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]* +// CHECK5-NEXT: [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 4 +// CHECK5-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.3* +// CHECK5-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4 +// CHECK5-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0 +// CHECK5-NEXT: [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 4 +// CHECK5-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_3:%.*]], %struct._globalized_locals_ty.3* [[TMP6]], i32 0, i32 0 +// CHECK5-NEXT: [[TMP10:%.*]] = getelementptr inbounds [1024 x i8], [1024 x i8]* [[C]], i32 0, i32 [[TMP7]] +// CHECK5-NEXT: 
[[TMP11:%.*]] = load i8, i8* [[TMP10]], align 128 +// CHECK5-NEXT: store i8 [[TMP11]], i8* [[TMP9]], align 1 +// CHECK5-NEXT: [[TMP12:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1 +// CHECK5-NEXT: [[TMP13:%.*]] = load i8*, i8** [[TMP12]], align 4 +// CHECK5-NEXT: [[TMP14:%.*]] = bitcast i8* [[TMP13]] to float* +// CHECK5-NEXT: [[D:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_3]], %struct._globalized_locals_ty.3* [[TMP6]], i32 0, i32 1 +// CHECK5-NEXT: [[TMP15:%.*]] = getelementptr inbounds [1024 x float], [1024 x float]* [[D]], i32 0, i32 [[TMP7]] +// CHECK5-NEXT: [[TMP16:%.*]] = load float, float* [[TMP15]], align 128 +// CHECK5-NEXT: store float [[TMP16]], float* [[TMP14]], align 4 +// CHECK5-NEXT: ret void +// +// +// CHECK5-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_reduce_func8 +// CHECK5-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { +// CHECK5-NEXT: entry: +// CHECK5-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 +// CHECK5-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4 +// CHECK5-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 4 +// CHECK5-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 +// CHECK5-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK5-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4 +// CHECK5-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4 +// CHECK5-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.3* +// CHECK5-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4 +// CHECK5-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0 +// CHECK5-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_3:%.*]], %struct._globalized_locals_ty.3* [[TMP4]], i32 0, i32 0 +// CHECK5-NEXT: [[TMP7:%.*]] = getelementptr inbounds [1024 x i8], [1024 x i8]* [[C]], i32 0, i32 [[TMP5]] +// CHECK5-NEXT: store i8* [[TMP7]], i8** [[TMP6]], align 4 +// CHECK5-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1 +// CHECK5-NEXT: [[D:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_3]], %struct._globalized_locals_ty.3* [[TMP4]], i32 0, i32 1 +// CHECK5-NEXT: [[TMP9:%.*]] = getelementptr inbounds [1024 x float], [1024 x float]* [[D]], i32 0, i32 [[TMP5]] +// CHECK5-NEXT: [[TMP10:%.*]] = bitcast float* [[TMP9]] to i8* +// CHECK5-NEXT: store i8* [[TMP10]], i8** [[TMP8]], align 4 +// CHECK5-NEXT: [[TMP11:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8* +// CHECK5-NEXT: [[TMP12:%.*]] = load i8*, i8** [[DOTADDR2]], align 4 +// CHECK5-NEXT: call void @"_omp$reduction$reduction_func2"(i8* [[TMP12]], i8* [[TMP11]]) #[[ATTR3]] +// CHECK5-NEXT: ret void +// +// +// CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l36 +// CHECK5-SAME: (i32 [[A:%.*]], i32 [[B:%.*]]) #[[ATTR1]] { +// CHECK5-NEXT: entry: +// CHECK5-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[B_ADDR:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK5-NEXT: store i32 [[A]], i32* [[A_ADDR]], align 4 +// CHECK5-NEXT: store i32 [[B]], i32* [[B_ADDR]], align 4 +// CHECK5-NEXT: [[CONV:%.*]] = bitcast i32* [[B_ADDR]] to i16* +// CHECK5-NEXT: 
[[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK5-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 1) +// CHECK5-NEXT: call void @__kmpc_data_sharing_init_stack_spmd() +// CHECK5-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK5: .execute: +// CHECK5-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3:[0-9]+]]) +// CHECK5-NEXT: store i32 [[TMP0]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK5-NEXT: call void @__omp_outlined__9(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32* [[A_ADDR]], i16* [[CONV]]) #[[ATTR3]] +// CHECK5-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK5: .omp.deinit: +// CHECK5-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 1) +// CHECK5-NEXT: br label [[DOTEXIT:%.*]] +// CHECK5: .exit: +// CHECK5-NEXT: ret void +// +// +// CHECK5-LABEL: define {{[^@]+}}@__omp_outlined__9 +// CHECK5-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i16* nonnull align 2 dereferenceable(2) [[B:%.*]]) #[[ATTR1]] { +// CHECK5-NEXT: entry: +// CHECK5-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK5-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK5-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 4 +// CHECK5-NEXT: [[B_ADDR:%.*]] = alloca i16*, align 4 +// CHECK5-NEXT: [[A1:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[B2:%.*]] = alloca i16, align 2 +// CHECK5-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [2 x i8*], align 4 +// CHECK5-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 4 +// CHECK5-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK5-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK5-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 4 +// CHECK5-NEXT: store i16* [[B]], i16** [[B_ADDR]], align 4 +// CHECK5-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 4 +// CHECK5-NEXT: [[TMP1:%.*]] = load i16*, i16** [[B_ADDR]], align 4 +// CHECK5-NEXT: store i32 0, i32* [[A1]], align 4 +// CHECK5-NEXT: store i16 -32768, i16* [[B2]], align 2 +// CHECK5-NEXT: [[TMP2:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 +// CHECK5-NEXT: [[TMP3:%.*]] = bitcast i32* [[A1]] to i8* +// CHECK5-NEXT: store i8* [[TMP3]], i8** [[TMP2]], align 4 +// CHECK5-NEXT: [[TMP4:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 +// CHECK5-NEXT: [[TMP5:%.*]] = bitcast i16* [[B2]] to i8* +// CHECK5-NEXT: store i8* [[TMP5]], i8** [[TMP4]], align 4 +// CHECK5-NEXT: [[TMP6:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK5-NEXT: [[TMP7:%.*]] = load i32, i32* [[TMP6]], align 4 +// CHECK5-NEXT: [[TMP8:%.*]] = bitcast [2 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK5-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP7]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32*, i16*)* @__omp_outlined__10 to i8*), i8* null, i8** [[TMP8]], i32 2) +// CHECK5-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0 +// CHECK5-NEXT: [[TMP10:%.*]] = bitcast i32* [[A1]] to i8* +// CHECK5-NEXT: store i8* [[TMP10]], i8** [[TMP9]], align 4 +// CHECK5-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1 +// CHECK5-NEXT: [[TMP12:%.*]] = bitcast i16* [[B2]] to i8* +// CHECK5-NEXT: store i8* [[TMP12]], i8** [[TMP11]], 
align 4 +// CHECK5-NEXT: [[TMP13:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8* +// CHECK5-NEXT: [[TMP14:%.*]] = load i8*, i8** @"_openmp_teams_reductions_buffer_$_$ptr", align 4 +// CHECK5-NEXT: [[TMP15:%.*]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(%struct.ident_t* @[[GLOB3]], i32 [[TMP7]], i8* [[TMP14]], i32 1024, i8* [[TMP13]], void (i8*, i16, i16, i16)* @_omp_reduction_shuffle_and_reduce_func15, void (i8*, i32)* @_omp_reduction_inter_warp_copy_func16, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_copy_func17, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_reduce_func18, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_copy_func19, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_reduce_func20) +// CHECK5-NEXT: [[TMP16:%.*]] = icmp eq i32 [[TMP15]], 1 +// CHECK5-NEXT: br i1 [[TMP16]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]] +// CHECK5: .omp.reduction.then: +// CHECK5-NEXT: [[TMP17:%.*]] = load i32, i32* [[TMP0]], align 4 +// CHECK5-NEXT: [[TMP18:%.*]] = load i32, i32* [[A1]], align 4 +// CHECK5-NEXT: [[OR:%.*]] = or i32 [[TMP17]], [[TMP18]] +// CHECK5-NEXT: store i32 [[OR]], i32* [[TMP0]], align 4 +// CHECK5-NEXT: [[TMP19:%.*]] = load i16, i16* [[TMP1]], align 2 +// CHECK5-NEXT: [[CONV:%.*]] = sext i16 [[TMP19]] to i32 +// CHECK5-NEXT: [[TMP20:%.*]] = load i16, i16* [[B2]], align 2 +// CHECK5-NEXT: [[CONV3:%.*]] = sext i16 [[TMP20]] to i32 +// CHECK5-NEXT: [[CMP:%.*]] = icmp sgt i32 [[CONV]], [[CONV3]] +// CHECK5-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK5: cond.true: +// CHECK5-NEXT: [[TMP21:%.*]] = load i16, i16* [[TMP1]], align 2 +// CHECK5-NEXT: br label [[COND_END:%.*]] +// CHECK5: cond.false: +// CHECK5-NEXT: [[TMP22:%.*]] = load i16, i16* [[B2]], align 2 +// CHECK5-NEXT: br label [[COND_END]] +// CHECK5: cond.end: +// CHECK5-NEXT: [[COND:%.*]] = phi i16 [ [[TMP21]], [[COND_TRUE]] ], [ [[TMP22]], [[COND_FALSE]] ] +// CHECK5-NEXT: store i16 [[COND]], i16* [[TMP1]], align 2 +// CHECK5-NEXT: call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP7]]) +// CHECK5-NEXT: br label [[DOTOMP_REDUCTION_DONE]] +// CHECK5: .omp.reduction.done: +// CHECK5-NEXT: ret void +// +// +// CHECK5-LABEL: define {{[^@]+}}@__omp_outlined__10 +// CHECK5-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i16* nonnull align 2 dereferenceable(2) [[B:%.*]]) #[[ATTR1]] { +// CHECK5-NEXT: entry: +// CHECK5-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK5-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK5-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 4 +// CHECK5-NEXT: [[B_ADDR:%.*]] = alloca i16*, align 4 +// CHECK5-NEXT: [[A1:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[B2:%.*]] = alloca i16, align 2 +// CHECK5-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 4 +// CHECK5-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK5-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK5-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 4 +// CHECK5-NEXT: store i16* [[B]], i16** [[B_ADDR]], align 4 +// CHECK5-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 4 +// CHECK5-NEXT: [[TMP1:%.*]] = load i16*, i16** [[B_ADDR]], align 4 +// CHECK5-NEXT: store i32 0, i32* [[A1]], align 4 +// CHECK5-NEXT: store i16 -32768, i16* [[B2]], align 2 +// CHECK5-NEXT: [[TMP2:%.*]] = load i32, i32* [[A1]], align 4 +// CHECK5-NEXT: 
[[OR:%.*]] = or i32 [[TMP2]], 1 +// CHECK5-NEXT: store i32 [[OR]], i32* [[A1]], align 4 +// CHECK5-NEXT: [[TMP3:%.*]] = load i16, i16* [[B2]], align 2 +// CHECK5-NEXT: [[CONV:%.*]] = sext i16 [[TMP3]] to i32 +// CHECK5-NEXT: [[CMP:%.*]] = icmp sgt i32 99, [[CONV]] +// CHECK5-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK5: cond.true: +// CHECK5-NEXT: br label [[COND_END:%.*]] +// CHECK5: cond.false: +// CHECK5-NEXT: [[TMP4:%.*]] = load i16, i16* [[B2]], align 2 +// CHECK5-NEXT: [[CONV3:%.*]] = sext i16 [[TMP4]] to i32 +// CHECK5-NEXT: br label [[COND_END]] +// CHECK5: cond.end: +// CHECK5-NEXT: [[COND:%.*]] = phi i32 [ 99, [[COND_TRUE]] ], [ [[CONV3]], [[COND_FALSE]] ] +// CHECK5-NEXT: [[CONV4:%.*]] = trunc i32 [[COND]] to i16 +// CHECK5-NEXT: store i16 [[CONV4]], i16* [[B2]], align 2 +// CHECK5-NEXT: [[TMP5:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK5-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4 +// CHECK5-NEXT: [[TMP7:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0 +// CHECK5-NEXT: [[TMP8:%.*]] = bitcast i32* [[A1]] to i8* +// CHECK5-NEXT: store i8* [[TMP8]], i8** [[TMP7]], align 4 +// CHECK5-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1 +// CHECK5-NEXT: [[TMP10:%.*]] = bitcast i16* [[B2]] to i8* +// CHECK5-NEXT: store i8* [[TMP10]], i8** [[TMP9]], align 4 +// CHECK5-NEXT: [[TMP11:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8* +// CHECK5-NEXT: [[TMP12:%.*]] = call i32 @__kmpc_nvptx_parallel_reduce_nowait_v2(%struct.ident_t* @[[GLOB3]], i32 [[TMP6]], i32 2, i32 8, i8* [[TMP11]], void (i8*, i16, i16, i16)* @_omp_reduction_shuffle_and_reduce_func12, void (i8*, i32)* @_omp_reduction_inter_warp_copy_func13) +// CHECK5-NEXT: [[TMP13:%.*]] = icmp eq i32 [[TMP12]], 1 +// CHECK5-NEXT: br i1 [[TMP13]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]] +// CHECK5: .omp.reduction.then: +// CHECK5-NEXT: [[TMP14:%.*]] = load i32, i32* [[TMP0]], align 4 +// CHECK5-NEXT: [[TMP15:%.*]] = load i32, i32* [[A1]], align 4 +// CHECK5-NEXT: [[OR5:%.*]] = or i32 [[TMP14]], [[TMP15]] +// CHECK5-NEXT: store i32 [[OR5]], i32* [[TMP0]], align 4 +// CHECK5-NEXT: [[TMP16:%.*]] = load i16, i16* [[TMP1]], align 2 +// CHECK5-NEXT: [[CONV6:%.*]] = sext i16 [[TMP16]] to i32 +// CHECK5-NEXT: [[TMP17:%.*]] = load i16, i16* [[B2]], align 2 +// CHECK5-NEXT: [[CONV7:%.*]] = sext i16 [[TMP17]] to i32 +// CHECK5-NEXT: [[CMP8:%.*]] = icmp sgt i32 [[CONV6]], [[CONV7]] +// CHECK5-NEXT: br i1 [[CMP8]], label [[COND_TRUE9:%.*]], label [[COND_FALSE10:%.*]] +// CHECK5: cond.true9: +// CHECK5-NEXT: [[TMP18:%.*]] = load i16, i16* [[TMP1]], align 2 +// CHECK5-NEXT: br label [[COND_END11:%.*]] +// CHECK5: cond.false10: +// CHECK5-NEXT: [[TMP19:%.*]] = load i16, i16* [[B2]], align 2 +// CHECK5-NEXT: br label [[COND_END11]] +// CHECK5: cond.end11: +// CHECK5-NEXT: [[COND12:%.*]] = phi i16 [ [[TMP18]], [[COND_TRUE9]] ], [ [[TMP19]], [[COND_FALSE10]] ] +// CHECK5-NEXT: store i16 [[COND12]], i16* [[TMP1]], align 2 +// CHECK5-NEXT: call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP6]]) +// CHECK5-NEXT: br label [[DOTOMP_REDUCTION_DONE]] +// CHECK5: .omp.reduction.done: +// CHECK5-NEXT: ret void +// +// +// CHECK5-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func12 +// CHECK5-SAME: (i8* [[TMP0:%.*]], i16 signext [[TMP1:%.*]], i16 signext [[TMP2:%.*]], i16 signext [[TMP3:%.*]]) #[[ATTR0]] { +// CHECK5-NEXT: entry: +// 
CHECK5-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 +// CHECK5-NEXT: [[DOTADDR1:%.*]] = alloca i16, align 2 +// CHECK5-NEXT: [[DOTADDR2:%.*]] = alloca i16, align 2 +// CHECK5-NEXT: [[DOTADDR3:%.*]] = alloca i16, align 2 +// CHECK5-NEXT: [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST:%.*]] = alloca [2 x i8*], align 4 +// CHECK5-NEXT: [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTOMP_REDUCTION_ELEMENT4:%.*]] = alloca i16, align 2 +// CHECK5-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 +// CHECK5-NEXT: store i16 [[TMP1]], i16* [[DOTADDR1]], align 2 +// CHECK5-NEXT: store i16 [[TMP2]], i16* [[DOTADDR2]], align 2 +// CHECK5-NEXT: store i16 [[TMP3]], i16* [[DOTADDR3]], align 2 +// CHECK5-NEXT: [[TMP4:%.*]] = load i8*, i8** [[DOTADDR]], align 4 +// CHECK5-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to [2 x i8*]* +// CHECK5-NEXT: [[TMP6:%.*]] = load i16, i16* [[DOTADDR1]], align 2 +// CHECK5-NEXT: [[TMP7:%.*]] = load i16, i16* [[DOTADDR2]], align 2 +// CHECK5-NEXT: [[TMP8:%.*]] = load i16, i16* [[DOTADDR3]], align 2 +// CHECK5-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 0 +// CHECK5-NEXT: [[TMP10:%.*]] = load i8*, i8** [[TMP9]], align 4 +// CHECK5-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0 +// CHECK5-NEXT: [[TMP12:%.*]] = bitcast i8* [[TMP10]] to i32* +// CHECK5-NEXT: [[TMP13:%.*]] = getelementptr i32, i32* [[TMP12]], i32 1 +// CHECK5-NEXT: [[TMP14:%.*]] = bitcast i32* [[TMP13]] to i8* +// CHECK5-NEXT: [[TMP15:%.*]] = load i32, i32* [[TMP12]], align 4 +// CHECK5-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK5-NEXT: [[TMP16:%.*]] = trunc i32 [[NVPTX_WARP_SIZE]] to i16 +// CHECK5-NEXT: [[TMP17:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP15]], i16 [[TMP7]], i16 [[TMP16]]) +// CHECK5-NEXT: store i32 [[TMP17]], i32* [[DOTOMP_REDUCTION_ELEMENT]], align 4 +// CHECK5-NEXT: [[TMP18:%.*]] = getelementptr i32, i32* [[TMP12]], i32 1 +// CHECK5-NEXT: [[TMP19:%.*]] = getelementptr i32, i32* [[DOTOMP_REDUCTION_ELEMENT]], i32 1 +// CHECK5-NEXT: [[TMP20:%.*]] = bitcast i32* [[DOTOMP_REDUCTION_ELEMENT]] to i8* +// CHECK5-NEXT: store i8* [[TMP20]], i8** [[TMP11]], align 4 +// CHECK5-NEXT: [[TMP21:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 1 +// CHECK5-NEXT: [[TMP22:%.*]] = load i8*, i8** [[TMP21]], align 4 +// CHECK5-NEXT: [[TMP23:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1 +// CHECK5-NEXT: [[TMP24:%.*]] = bitcast i8* [[TMP22]] to i16* +// CHECK5-NEXT: [[TMP25:%.*]] = getelementptr i16, i16* [[TMP24]], i32 1 +// CHECK5-NEXT: [[TMP26:%.*]] = bitcast i16* [[TMP25]] to i8* +// CHECK5-NEXT: [[TMP27:%.*]] = load i16, i16* [[TMP24]], align 2 +// CHECK5-NEXT: [[TMP28:%.*]] = sext i16 [[TMP27]] to i32 +// CHECK5-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK5-NEXT: [[TMP29:%.*]] = trunc i32 [[NVPTX_WARP_SIZE5]] to i16 +// CHECK5-NEXT: [[TMP30:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP28]], i16 [[TMP7]], i16 [[TMP29]]) +// CHECK5-NEXT: [[TMP31:%.*]] = trunc i32 [[TMP30]] to i16 +// CHECK5-NEXT: store i16 [[TMP31]], i16* [[DOTOMP_REDUCTION_ELEMENT4]], align 2 +// CHECK5-NEXT: [[TMP32:%.*]] = getelementptr i16, i16* [[TMP24]], i32 1 +// CHECK5-NEXT: [[TMP33:%.*]] = getelementptr i16, i16* [[DOTOMP_REDUCTION_ELEMENT4]], i32 1 +// CHECK5-NEXT: [[TMP34:%.*]] = bitcast i16* [[DOTOMP_REDUCTION_ELEMENT4]] 
to i8* +// CHECK5-NEXT: store i8* [[TMP34]], i8** [[TMP23]], align 4 +// CHECK5-NEXT: [[TMP35:%.*]] = icmp eq i16 [[TMP8]], 0 +// CHECK5-NEXT: [[TMP36:%.*]] = icmp eq i16 [[TMP8]], 1 +// CHECK5-NEXT: [[TMP37:%.*]] = icmp ult i16 [[TMP6]], [[TMP7]] +// CHECK5-NEXT: [[TMP38:%.*]] = and i1 [[TMP36]], [[TMP37]] +// CHECK5-NEXT: [[TMP39:%.*]] = icmp eq i16 [[TMP8]], 2 +// CHECK5-NEXT: [[TMP40:%.*]] = and i16 [[TMP6]], 1 +// CHECK5-NEXT: [[TMP41:%.*]] = icmp eq i16 [[TMP40]], 0 +// CHECK5-NEXT: [[TMP42:%.*]] = and i1 [[TMP39]], [[TMP41]] +// CHECK5-NEXT: [[TMP43:%.*]] = icmp sgt i16 [[TMP7]], 0 +// CHECK5-NEXT: [[TMP44:%.*]] = and i1 [[TMP42]], [[TMP43]] +// CHECK5-NEXT: [[TMP45:%.*]] = or i1 [[TMP35]], [[TMP38]] +// CHECK5-NEXT: [[TMP46:%.*]] = or i1 [[TMP45]], [[TMP44]] +// CHECK5-NEXT: br i1 [[TMP46]], label [[THEN:%.*]], label [[ELSE:%.*]] +// CHECK5: then: +// CHECK5-NEXT: [[TMP47:%.*]] = bitcast [2 x i8*]* [[TMP5]] to i8* +// CHECK5-NEXT: [[TMP48:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to i8* +// CHECK5-NEXT: call void @"_omp$reduction$reduction_func11"(i8* [[TMP47]], i8* [[TMP48]]) #[[ATTR3]] +// CHECK5-NEXT: br label [[IFCONT:%.*]] +// CHECK5: else: +// CHECK5-NEXT: br label [[IFCONT]] +// CHECK5: ifcont: +// CHECK5-NEXT: [[TMP49:%.*]] = icmp eq i16 [[TMP8]], 1 +// CHECK5-NEXT: [[TMP50:%.*]] = icmp uge i16 [[TMP6]], [[TMP7]] +// CHECK5-NEXT: [[TMP51:%.*]] = and i1 [[TMP49]], [[TMP50]] +// CHECK5-NEXT: br i1 [[TMP51]], label [[THEN6:%.*]], label [[ELSE7:%.*]] +// CHECK5: then6: +// CHECK5-NEXT: [[TMP52:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0 +// CHECK5-NEXT: [[TMP53:%.*]] = load i8*, i8** [[TMP52]], align 4 +// CHECK5-NEXT: [[TMP54:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 0 +// CHECK5-NEXT: [[TMP55:%.*]] = load i8*, i8** [[TMP54]], align 4 +// CHECK5-NEXT: [[TMP56:%.*]] = bitcast i8* [[TMP53]] to i32* +// CHECK5-NEXT: [[TMP57:%.*]] = bitcast i8* [[TMP55]] to i32* +// CHECK5-NEXT: [[TMP58:%.*]] = load i32, i32* [[TMP56]], align 4 +// CHECK5-NEXT: store i32 [[TMP58]], i32* [[TMP57]], align 4 +// CHECK5-NEXT: [[TMP59:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1 +// CHECK5-NEXT: [[TMP60:%.*]] = load i8*, i8** [[TMP59]], align 4 +// CHECK5-NEXT: [[TMP61:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 1 +// CHECK5-NEXT: [[TMP62:%.*]] = load i8*, i8** [[TMP61]], align 4 +// CHECK5-NEXT: [[TMP63:%.*]] = bitcast i8* [[TMP60]] to i16* +// CHECK5-NEXT: [[TMP64:%.*]] = bitcast i8* [[TMP62]] to i16* +// CHECK5-NEXT: [[TMP65:%.*]] = load i16, i16* [[TMP63]], align 2 +// CHECK5-NEXT: store i16 [[TMP65]], i16* [[TMP64]], align 2 +// CHECK5-NEXT: br label [[IFCONT8:%.*]] +// CHECK5: else7: +// CHECK5-NEXT: br label [[IFCONT8]] +// CHECK5: ifcont8: +// CHECK5-NEXT: ret void +// +// +// CHECK5-LABEL: define {{[^@]+}}@_omp_reduction_inter_warp_copy_func13 +// CHECK5-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { +// CHECK5-NEXT: entry: +// CHECK5-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 +// CHECK5-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) +// CHECK5-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 +// CHECK5-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK5-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK5-NEXT: [[NVPTX_TID2:%.*]] = call i32 
@llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK5-NEXT: [[NVPTX_LANE_ID:%.*]] = and i32 [[NVPTX_TID2]], 31 +// CHECK5-NEXT: [[NVPTX_TID3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK5-NEXT: [[NVPTX_WARP_ID:%.*]] = ashr i32 [[NVPTX_TID3]], 5 +// CHECK5-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4 +// CHECK5-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]* +// CHECK5-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4:[0-9]+]], i32 [[TMP2]]) +// CHECK5-NEXT: [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0 +// CHECK5-NEXT: br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]] +// CHECK5: then: +// CHECK5-NEXT: [[TMP5:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0 +// CHECK5-NEXT: [[TMP6:%.*]] = load i8*, i8** [[TMP5]], align 4 +// CHECK5-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to i32* +// CHECK5-NEXT: [[TMP8:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]] +// CHECK5-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP7]], align 4 +// CHECK5-NEXT: store volatile i32 [[TMP9]], i32 addrspace(3)* [[TMP8]], align 4 +// CHECK5-NEXT: br label [[IFCONT:%.*]] +// CHECK5: else: +// CHECK5-NEXT: br label [[IFCONT]] +// CHECK5: ifcont: +// CHECK5-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]]) +// CHECK5-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTADDR1]], align 4 +// CHECK5-NEXT: [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP10]] +// CHECK5-NEXT: br i1 [[IS_ACTIVE_THREAD]], label [[THEN4:%.*]], label [[ELSE5:%.*]] +// CHECK5: then4: +// CHECK5-NEXT: [[TMP11:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]] +// CHECK5-NEXT: [[TMP12:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0 +// CHECK5-NEXT: [[TMP13:%.*]] = load i8*, i8** [[TMP12]], align 4 +// CHECK5-NEXT: [[TMP14:%.*]] = bitcast i8* [[TMP13]] to i32* +// CHECK5-NEXT: [[TMP15:%.*]] = load volatile i32, i32 addrspace(3)* [[TMP11]], align 4 +// CHECK5-NEXT: store i32 [[TMP15]], i32* [[TMP14]], align 4 +// CHECK5-NEXT: br label [[IFCONT6:%.*]] +// CHECK5: else5: +// CHECK5-NEXT: br label [[IFCONT6]] +// CHECK5: ifcont6: +// CHECK5-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]]) +// CHECK5-NEXT: [[WARP_MASTER7:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0 +// CHECK5-NEXT: br i1 [[WARP_MASTER7]], label [[THEN8:%.*]], label [[ELSE9:%.*]] +// CHECK5: then8: +// CHECK5-NEXT: [[TMP16:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1 +// CHECK5-NEXT: [[TMP17:%.*]] = load i8*, i8** [[TMP16]], align 4 +// CHECK5-NEXT: [[TMP18:%.*]] = bitcast i8* [[TMP17]] to i16* +// CHECK5-NEXT: [[TMP19:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]] +// CHECK5-NEXT: [[TMP20:%.*]] = bitcast i32 addrspace(3)* [[TMP19]] to i16 addrspace(3)* +// CHECK5-NEXT: [[TMP21:%.*]] = load i16, i16* [[TMP18]], align 2 +// CHECK5-NEXT: store volatile i16 [[TMP21]], i16 addrspace(3)* [[TMP20]], align 2 +// CHECK5-NEXT: br label [[IFCONT10:%.*]] +// CHECK5: else9: +// CHECK5-NEXT: br label [[IFCONT10]] +// CHECK5: ifcont10: +// CHECK5-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]]) +// CHECK5-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTADDR1]], align 4 +// CHECK5-NEXT: [[IS_ACTIVE_THREAD11:%.*]] = icmp ult 
i32 [[NVPTX_TID]], [[TMP22]] +// CHECK5-NEXT: br i1 [[IS_ACTIVE_THREAD11]], label [[THEN12:%.*]], label [[ELSE13:%.*]] +// CHECK5: then12: +// CHECK5-NEXT: [[TMP23:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]] +// CHECK5-NEXT: [[TMP24:%.*]] = bitcast i32 addrspace(3)* [[TMP23]] to i16 addrspace(3)* +// CHECK5-NEXT: [[TMP25:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1 +// CHECK5-NEXT: [[TMP26:%.*]] = load i8*, i8** [[TMP25]], align 4 +// CHECK5-NEXT: [[TMP27:%.*]] = bitcast i8* [[TMP26]] to i16* +// CHECK5-NEXT: [[TMP28:%.*]] = load volatile i16, i16 addrspace(3)* [[TMP24]], align 2 +// CHECK5-NEXT: store i16 [[TMP28]], i16* [[TMP27]], align 2 +// CHECK5-NEXT: br label [[IFCONT14:%.*]] +// CHECK5: else13: +// CHECK5-NEXT: br label [[IFCONT14]] +// CHECK5: ifcont14: +// CHECK5-NEXT: ret void +// +// +// CHECK5-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func15 +// CHECK5-SAME: (i8* [[TMP0:%.*]], i16 signext [[TMP1:%.*]], i16 signext [[TMP2:%.*]], i16 signext [[TMP3:%.*]]) #[[ATTR0]] { +// CHECK5-NEXT: entry: +// CHECK5-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 +// CHECK5-NEXT: [[DOTADDR1:%.*]] = alloca i16, align 2 +// CHECK5-NEXT: [[DOTADDR2:%.*]] = alloca i16, align 2 +// CHECK5-NEXT: [[DOTADDR3:%.*]] = alloca i16, align 2 +// CHECK5-NEXT: [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST:%.*]] = alloca [2 x i8*], align 4 +// CHECK5-NEXT: [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTOMP_REDUCTION_ELEMENT4:%.*]] = alloca i16, align 2 +// CHECK5-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 +// CHECK5-NEXT: store i16 [[TMP1]], i16* [[DOTADDR1]], align 2 +// CHECK5-NEXT: store i16 [[TMP2]], i16* [[DOTADDR2]], align 2 +// CHECK5-NEXT: store i16 [[TMP3]], i16* [[DOTADDR3]], align 2 +// CHECK5-NEXT: [[TMP4:%.*]] = load i8*, i8** [[DOTADDR]], align 4 +// CHECK5-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to [2 x i8*]* +// CHECK5-NEXT: [[TMP6:%.*]] = load i16, i16* [[DOTADDR1]], align 2 +// CHECK5-NEXT: [[TMP7:%.*]] = load i16, i16* [[DOTADDR2]], align 2 +// CHECK5-NEXT: [[TMP8:%.*]] = load i16, i16* [[DOTADDR3]], align 2 +// CHECK5-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 0 +// CHECK5-NEXT: [[TMP10:%.*]] = load i8*, i8** [[TMP9]], align 4 +// CHECK5-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0 +// CHECK5-NEXT: [[TMP12:%.*]] = bitcast i8* [[TMP10]] to i32* +// CHECK5-NEXT: [[TMP13:%.*]] = getelementptr i32, i32* [[TMP12]], i32 1 +// CHECK5-NEXT: [[TMP14:%.*]] = bitcast i32* [[TMP13]] to i8* +// CHECK5-NEXT: [[TMP15:%.*]] = load i32, i32* [[TMP12]], align 4 +// CHECK5-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK5-NEXT: [[TMP16:%.*]] = trunc i32 [[NVPTX_WARP_SIZE]] to i16 +// CHECK5-NEXT: [[TMP17:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP15]], i16 [[TMP7]], i16 [[TMP16]]) +// CHECK5-NEXT: store i32 [[TMP17]], i32* [[DOTOMP_REDUCTION_ELEMENT]], align 4 +// CHECK5-NEXT: [[TMP18:%.*]] = getelementptr i32, i32* [[TMP12]], i32 1 +// CHECK5-NEXT: [[TMP19:%.*]] = getelementptr i32, i32* [[DOTOMP_REDUCTION_ELEMENT]], i32 1 +// CHECK5-NEXT: [[TMP20:%.*]] = bitcast i32* [[DOTOMP_REDUCTION_ELEMENT]] to i8* +// CHECK5-NEXT: store i8* [[TMP20]], i8** [[TMP11]], align 4 +// CHECK5-NEXT: [[TMP21:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 1 +// 
CHECK5-NEXT: [[TMP22:%.*]] = load i8*, i8** [[TMP21]], align 4 +// CHECK5-NEXT: [[TMP23:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1 +// CHECK5-NEXT: [[TMP24:%.*]] = bitcast i8* [[TMP22]] to i16* +// CHECK5-NEXT: [[TMP25:%.*]] = getelementptr i16, i16* [[TMP24]], i32 1 +// CHECK5-NEXT: [[TMP26:%.*]] = bitcast i16* [[TMP25]] to i8* +// CHECK5-NEXT: [[TMP27:%.*]] = load i16, i16* [[TMP24]], align 2 +// CHECK5-NEXT: [[TMP28:%.*]] = sext i16 [[TMP27]] to i32 +// CHECK5-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK5-NEXT: [[TMP29:%.*]] = trunc i32 [[NVPTX_WARP_SIZE5]] to i16 +// CHECK5-NEXT: [[TMP30:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP28]], i16 [[TMP7]], i16 [[TMP29]]) +// CHECK5-NEXT: [[TMP31:%.*]] = trunc i32 [[TMP30]] to i16 +// CHECK5-NEXT: store i16 [[TMP31]], i16* [[DOTOMP_REDUCTION_ELEMENT4]], align 2 +// CHECK5-NEXT: [[TMP32:%.*]] = getelementptr i16, i16* [[TMP24]], i32 1 +// CHECK5-NEXT: [[TMP33:%.*]] = getelementptr i16, i16* [[DOTOMP_REDUCTION_ELEMENT4]], i32 1 +// CHECK5-NEXT: [[TMP34:%.*]] = bitcast i16* [[DOTOMP_REDUCTION_ELEMENT4]] to i8* +// CHECK5-NEXT: store i8* [[TMP34]], i8** [[TMP23]], align 4 +// CHECK5-NEXT: [[TMP35:%.*]] = icmp eq i16 [[TMP8]], 0 +// CHECK5-NEXT: [[TMP36:%.*]] = icmp eq i16 [[TMP8]], 1 +// CHECK5-NEXT: [[TMP37:%.*]] = icmp ult i16 [[TMP6]], [[TMP7]] +// CHECK5-NEXT: [[TMP38:%.*]] = and i1 [[TMP36]], [[TMP37]] +// CHECK5-NEXT: [[TMP39:%.*]] = icmp eq i16 [[TMP8]], 2 +// CHECK5-NEXT: [[TMP40:%.*]] = and i16 [[TMP6]], 1 +// CHECK5-NEXT: [[TMP41:%.*]] = icmp eq i16 [[TMP40]], 0 +// CHECK5-NEXT: [[TMP42:%.*]] = and i1 [[TMP39]], [[TMP41]] +// CHECK5-NEXT: [[TMP43:%.*]] = icmp sgt i16 [[TMP7]], 0 +// CHECK5-NEXT: [[TMP44:%.*]] = and i1 [[TMP42]], [[TMP43]] +// CHECK5-NEXT: [[TMP45:%.*]] = or i1 [[TMP35]], [[TMP38]] +// CHECK5-NEXT: [[TMP46:%.*]] = or i1 [[TMP45]], [[TMP44]] +// CHECK5-NEXT: br i1 [[TMP46]], label [[THEN:%.*]], label [[ELSE:%.*]] +// CHECK5: then: +// CHECK5-NEXT: [[TMP47:%.*]] = bitcast [2 x i8*]* [[TMP5]] to i8* +// CHECK5-NEXT: [[TMP48:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to i8* +// CHECK5-NEXT: call void @"_omp$reduction$reduction_func14"(i8* [[TMP47]], i8* [[TMP48]]) #[[ATTR3]] +// CHECK5-NEXT: br label [[IFCONT:%.*]] +// CHECK5: else: +// CHECK5-NEXT: br label [[IFCONT]] +// CHECK5: ifcont: +// CHECK5-NEXT: [[TMP49:%.*]] = icmp eq i16 [[TMP8]], 1 +// CHECK5-NEXT: [[TMP50:%.*]] = icmp uge i16 [[TMP6]], [[TMP7]] +// CHECK5-NEXT: [[TMP51:%.*]] = and i1 [[TMP49]], [[TMP50]] +// CHECK5-NEXT: br i1 [[TMP51]], label [[THEN6:%.*]], label [[ELSE7:%.*]] +// CHECK5: then6: +// CHECK5-NEXT: [[TMP52:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0 +// CHECK5-NEXT: [[TMP53:%.*]] = load i8*, i8** [[TMP52]], align 4 +// CHECK5-NEXT: [[TMP54:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 0 +// CHECK5-NEXT: [[TMP55:%.*]] = load i8*, i8** [[TMP54]], align 4 +// CHECK5-NEXT: [[TMP56:%.*]] = bitcast i8* [[TMP53]] to i32* +// CHECK5-NEXT: [[TMP57:%.*]] = bitcast i8* [[TMP55]] to i32* +// CHECK5-NEXT: [[TMP58:%.*]] = load i32, i32* [[TMP56]], align 4 +// CHECK5-NEXT: store i32 [[TMP58]], i32* [[TMP57]], align 4 +// CHECK5-NEXT: [[TMP59:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1 +// CHECK5-NEXT: [[TMP60:%.*]] = load i8*, i8** [[TMP59]], align 4 +// CHECK5-NEXT: 
[[TMP61:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 1
+// CHECK5-NEXT: [[TMP62:%.*]] = load i8*, i8** [[TMP61]], align 4
+// CHECK5-NEXT: [[TMP63:%.*]] = bitcast i8* [[TMP60]] to i16*
+// CHECK5-NEXT: [[TMP64:%.*]] = bitcast i8* [[TMP62]] to i16*
+// CHECK5-NEXT: [[TMP65:%.*]] = load i16, i16* [[TMP63]], align 2
+// CHECK5-NEXT: store i16 [[TMP65]], i16* [[TMP64]], align 2
+// CHECK5-NEXT: br label [[IFCONT8:%.*]]
+// CHECK5: else7:
+// CHECK5-NEXT: br label [[IFCONT8]]
+// CHECK5: ifcont8:
+// CHECK5-NEXT: ret void
+//
+//
+// CHECK5-LABEL: define {{[^@]+}}@_omp_reduction_inter_warp_copy_func16
+// CHECK5-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] {
+// CHECK5-NEXT: entry:
+// CHECK5-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
+// CHECK5-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]])
+// CHECK5-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
+// CHECK5-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
+// CHECK5-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+// CHECK5-NEXT: [[NVPTX_TID2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+// CHECK5-NEXT: [[NVPTX_LANE_ID:%.*]] = and i32 [[NVPTX_TID2]], 31
+// CHECK5-NEXT: [[NVPTX_TID3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+// CHECK5-NEXT: [[NVPTX_WARP_ID:%.*]] = ashr i32 [[NVPTX_TID3]], 5
+// CHECK5-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4
+// CHECK5-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]*
+// CHECK5-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]])
+// CHECK5-NEXT: [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0
+// CHECK5-NEXT: br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]]
+// CHECK5: then:
+// CHECK5-NEXT: [[TMP5:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0
+// CHECK5-NEXT: [[TMP6:%.*]] = load i8*, i8** [[TMP5]], align 4
+// CHECK5-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to i32*
+// CHECK5-NEXT: [[TMP8:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]]
+// CHECK5-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP7]], align 4
+// CHECK5-NEXT: store volatile i32 [[TMP9]], i32 addrspace(3)* [[TMP8]], align 4
+// CHECK5-NEXT: br label [[IFCONT:%.*]]
+// CHECK5: else:
+// CHECK5-NEXT: br label [[IFCONT]]
+// CHECK5: ifcont:
+// CHECK5-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]])
+// CHECK5-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTADDR1]], align 4
+// CHECK5-NEXT: [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP10]]
+// CHECK5-NEXT: br i1 [[IS_ACTIVE_THREAD]], label [[THEN4:%.*]], label [[ELSE5:%.*]]
+// CHECK5: then4:
+// CHECK5-NEXT: [[TMP11:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]]
+// CHECK5-NEXT: [[TMP12:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0
+// CHECK5-NEXT: [[TMP13:%.*]] = load i8*, i8** [[TMP12]], align 4
+// CHECK5-NEXT: [[TMP14:%.*]] = bitcast i8* [[TMP13]] to i32*
+// CHECK5-NEXT: [[TMP15:%.*]] = load volatile i32, i32 addrspace(3)* [[TMP11]], align 4
+// CHECK5-NEXT: store i32 [[TMP15]], i32* [[TMP14]], align 4
+// CHECK5-NEXT: br label [[IFCONT6:%.*]]
+// CHECK5: else5:
+// CHECK5-NEXT: br label [[IFCONT6]]
+// CHECK5: ifcont6:
+// CHECK5-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]])
+// CHECK5-NEXT: [[WARP_MASTER7:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0
+// CHECK5-NEXT: br i1 [[WARP_MASTER7]], label [[THEN8:%.*]], label [[ELSE9:%.*]]
+// CHECK5: then8:
+// CHECK5-NEXT: [[TMP16:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1
+// CHECK5-NEXT: [[TMP17:%.*]] = load i8*, i8** [[TMP16]], align 4
+// CHECK5-NEXT: [[TMP18:%.*]] = bitcast i8* [[TMP17]] to i16*
+// CHECK5-NEXT: [[TMP19:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]]
+// CHECK5-NEXT: [[TMP20:%.*]] = bitcast i32 addrspace(3)* [[TMP19]] to i16 addrspace(3)*
+// CHECK5-NEXT: [[TMP21:%.*]] = load i16, i16* [[TMP18]], align 2
+// CHECK5-NEXT: store volatile i16 [[TMP21]], i16 addrspace(3)* [[TMP20]], align 2
+// CHECK5-NEXT: br label [[IFCONT10:%.*]]
+// CHECK5: else9:
+// CHECK5-NEXT: br label [[IFCONT10]]
+// CHECK5: ifcont10:
+// CHECK5-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]])
+// CHECK5-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTADDR1]], align 4
+// CHECK5-NEXT: [[IS_ACTIVE_THREAD11:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP22]]
+// CHECK5-NEXT: br i1 [[IS_ACTIVE_THREAD11]], label [[THEN12:%.*]], label [[ELSE13:%.*]]
+// CHECK5: then12:
+// CHECK5-NEXT: [[TMP23:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]]
+// CHECK5-NEXT: [[TMP24:%.*]] = bitcast i32 addrspace(3)* [[TMP23]] to i16 addrspace(3)*
+// CHECK5-NEXT: [[TMP25:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1
+// CHECK5-NEXT: [[TMP26:%.*]] = load i8*, i8** [[TMP25]], align 4
+// CHECK5-NEXT: [[TMP27:%.*]] = bitcast i8* [[TMP26]] to i16*
+// CHECK5-NEXT: [[TMP28:%.*]] = load volatile i16, i16 addrspace(3)* [[TMP24]], align 2
+// CHECK5-NEXT: store i16 [[TMP28]], i16* [[TMP27]], align 2
+// CHECK5-NEXT: br label [[IFCONT14:%.*]]
+// CHECK5: else13:
+// CHECK5-NEXT: br label [[IFCONT14]]
+// CHECK5: ifcont14:
+// CHECK5-NEXT: ret void
+//
+//
+// CHECK5-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_copy_func17
+// CHECK5-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] {
+// CHECK5-NEXT: entry:
+// CHECK5-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
+// CHECK5-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4
+// CHECK5-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
+// CHECK5-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
+// CHECK5-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4
+// CHECK5-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 4
+// CHECK5-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]*
+// CHECK5-NEXT: [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 4
+// CHECK5-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.4*
+// CHECK5-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4
+// CHECK5-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0
+// CHECK5-NEXT: [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 4
+// CHECK5-NEXT: [[TMP10:%.*]] = bitcast i8* [[TMP9]] to i32*
+// CHECK5-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4:%.*]], %struct._globalized_locals_ty.4* [[TMP6]], i32 0, i32 0
+// CHECK5-NEXT: [[TMP11:%.*]] = getelementptr inbounds [1024 x i32], [1024 x i32]* [[A]], i32 0, i32 [[TMP7]]
+// CHECK5-NEXT: [[TMP12:%.*]] = load i32, i32* [[TMP10]], align 4
+// CHECK5-NEXT: store i32 [[TMP12]], i32* [[TMP11]], align 128
+// CHECK5-NEXT: [[TMP13:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1
+// CHECK5-NEXT: [[TMP14:%.*]] = load i8*, i8** [[TMP13]], align 4
+// CHECK5-NEXT: [[TMP15:%.*]] = bitcast i8* [[TMP14]] to i16*
+// CHECK5-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4]], %struct._globalized_locals_ty.4* [[TMP6]], i32 0, i32 1
+// CHECK5-NEXT: [[TMP16:%.*]] = getelementptr inbounds [1024 x i16], [1024 x i16]* [[B]], i32 0, i32 [[TMP7]]
+// CHECK5-NEXT: [[TMP17:%.*]] = load i16, i16* [[TMP15]], align 2
+// CHECK5-NEXT: store i16 [[TMP17]], i16* [[TMP16]], align 128
+// CHECK5-NEXT: ret void
+//
+//
+// CHECK5-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_reduce_func18
+// CHECK5-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] {
+// CHECK5-NEXT: entry:
+// CHECK5-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
+// CHECK5-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4
+// CHECK5-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 4
+// CHECK5-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
+// CHECK5-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
+// CHECK5-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4
+// CHECK5-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4
+// CHECK5-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.4*
+// CHECK5-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4
+// CHECK5-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0
+// CHECK5-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4:%.*]], %struct._globalized_locals_ty.4* [[TMP4]], i32 0, i32 0
+// CHECK5-NEXT: [[TMP7:%.*]] = getelementptr inbounds [1024 x i32], [1024 x i32]* [[A]], i32 0, i32 [[TMP5]]
+// CHECK5-NEXT: [[TMP8:%.*]] = bitcast i32* [[TMP7]] to i8*
+// CHECK5-NEXT: store i8* [[TMP8]], i8** [[TMP6]], align 4
+// CHECK5-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1
+// CHECK5-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4]], %struct._globalized_locals_ty.4* [[TMP4]], i32 0, i32 1
+// CHECK5-NEXT: [[TMP10:%.*]] = getelementptr inbounds [1024 x i16], [1024 x i16]* [[B]], i32 0, i32 [[TMP5]]
+// CHECK5-NEXT: [[TMP11:%.*]] = bitcast i16* [[TMP10]] to i8*
+// CHECK5-NEXT: store i8* [[TMP11]], i8** [[TMP9]], align 4
+// CHECK5-NEXT: [[TMP12:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
+// CHECK5-NEXT: [[TMP13:%.*]] = load i8*, i8** [[DOTADDR2]], align 4
+// CHECK5-NEXT: call void @"_omp$reduction$reduction_func14"(i8* [[TMP12]], i8* [[TMP13]]) #[[ATTR3]]
+// CHECK5-NEXT: ret void
+//
+//
+// CHECK5-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_copy_func19
+// CHECK5-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] {
+// CHECK5-NEXT: entry:
+// CHECK5-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
+// CHECK5-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4
+// CHECK5-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
+// CHECK5-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
+// CHECK5-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4
+// CHECK5-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 4
+// CHECK5-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]*
+// CHECK5-NEXT: [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 4
+// CHECK5-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.4*
+// CHECK5-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4
+// CHECK5-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0
+// CHECK5-NEXT: [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 4
+// CHECK5-NEXT: [[TMP10:%.*]] = bitcast i8* [[TMP9]] to i32*
+// CHECK5-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4:%.*]], %struct._globalized_locals_ty.4* [[TMP6]], i32 0, i32 0
+// CHECK5-NEXT: [[TMP11:%.*]] = getelementptr inbounds [1024 x i32], [1024 x i32]* [[A]], i32 0, i32 [[TMP7]]
+// CHECK5-NEXT: [[TMP12:%.*]] = load i32, i32* [[TMP11]], align 128
+// CHECK5-NEXT: store i32 [[TMP12]], i32* [[TMP10]], align 4
+// CHECK5-NEXT: [[TMP13:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1
+// CHECK5-NEXT: [[TMP14:%.*]] = load i8*, i8** [[TMP13]], align 4
+// CHECK5-NEXT: [[TMP15:%.*]] = bitcast i8* [[TMP14]] to i16*
+// CHECK5-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4]], %struct._globalized_locals_ty.4* [[TMP6]], i32 0, i32 1
+// CHECK5-NEXT: [[TMP16:%.*]] = getelementptr inbounds [1024 x i16], [1024 x i16]* [[B]], i32 0, i32 [[TMP7]]
+// CHECK5-NEXT: [[TMP17:%.*]] = load i16, i16* [[TMP16]], align 128
+// CHECK5-NEXT: store i16 [[TMP17]], i16* [[TMP15]], align 2
+// CHECK5-NEXT: ret void
+//
+//
+// CHECK5-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_reduce_func20
+// CHECK5-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] {
+// CHECK5-NEXT: entry:
+// CHECK5-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
+// CHECK5-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4
+// CHECK5-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 4
+// CHECK5-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
+// CHECK5-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
+// CHECK5-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4
+// CHECK5-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4
+// CHECK5-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.4*
+// CHECK5-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4
+// CHECK5-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0
+// CHECK5-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4:%.*]], %struct._globalized_locals_ty.4* [[TMP4]], i32 0, i32 0
+// CHECK5-NEXT: [[TMP7:%.*]] = getelementptr inbounds [1024 x i32], [1024 x i32]* [[A]], i32 0, i32 [[TMP5]]
+// CHECK5-NEXT: [[TMP8:%.*]] = bitcast i32* [[TMP7]] to i8*
+// CHECK5-NEXT: store i8* [[TMP8]], i8** [[TMP6]], align 4
+// CHECK5-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1
+// CHECK5-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4]], %struct._globalized_locals_ty.4* [[TMP4]], i32 0, i32 1
+// CHECK5-NEXT: [[TMP10:%.*]] = getelementptr inbounds [1024 x i16], [1024 x i16]* [[B]], i32 0, i32 [[TMP5]]
+// CHECK5-NEXT: [[TMP11:%.*]] = bitcast i16* [[TMP10]] to i8*
+// CHECK5-NEXT: store i8* [[TMP11]], i8** [[TMP9]], align 4
+// CHECK5-NEXT: [[TMP12:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
+// CHECK5-NEXT: [[TMP13:%.*]] = load i8*, i8** [[DOTADDR2]], align 4
+// CHECK5-NEXT: call void @"_omp$reduction$reduction_func14"(i8* [[TMP13]], i8* [[TMP12]]) #[[ATTR3]]
+// CHECK5-NEXT: ret void
+//
+//
+// CHECK6-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l23_worker
+// CHECK6-SAME: () #[[ATTR0:[0-9]+]] {
+// CHECK6-NEXT: entry:
+// CHECK6-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 4
+// CHECK6-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1
+// CHECK6-NEXT: store i8* null, i8** [[WORK_FN]], align 4
+// CHECK6-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1
+// CHECK6-NEXT: br label [[DOTAWAIT_WORK:%.*]]
+// CHECK6: .await.work:
+// CHECK6-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
+// CHECK6-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]])
+// CHECK6-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8
+// CHECK6-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1
+// CHECK6-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 4
+// CHECK6-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null
+// CHECK6-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]]
+// CHECK6: .select.workers:
+// CHECK6-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1
+// CHECK6-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0
+// CHECK6-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]]
+// CHECK6: .execute.parallel:
+// CHECK6-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]])
+// CHECK6-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)*
+// CHECK6-NEXT: call void [[TMP5]](i16 0, i32 [[TMP4]])
+// CHECK6-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]]
+// CHECK6: .terminate.parallel:
+// CHECK6-NEXT: call void @__kmpc_kernel_end_parallel()
+// CHECK6-NEXT: br label [[DOTBARRIER_PARALLEL]]
+// CHECK6: .barrier.parallel:
+// CHECK6-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
+// CHECK6-NEXT: br label [[DOTAWAIT_WORK]]
+// CHECK6: .exit:
+// CHECK6-NEXT: ret void
+//
+//
+// CHECK6-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l23
+// CHECK6-SAME: (double* nonnull align 8 dereferenceable(8) [[E:%.*]]) #[[ATTR1:[0-9]+]] {
+// CHECK6-NEXT: entry:
+// CHECK6-NEXT: [[E_ADDR:%.*]] = alloca double*, align 4
+// CHECK6-NEXT: [[E7:%.*]] = alloca double, align 8
+// CHECK6-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK6-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK6-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK6-NEXT: store double* [[E]], double** [[E_ADDR]], align 4
+// CHECK6-NEXT: [[TMP0:%.*]] = load double*, double** [[E_ADDR]], align 4
+// CHECK6-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+// CHECK6-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK6-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+// CHECK6-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]]
+// CHECK6-NEXT: [[TMP1:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]]
+// CHECK6-NEXT: br i1 [[TMP1]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]]
+// CHECK6: .worker:
+// CHECK6-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l23_worker() #[[ATTR3:[0-9]+]]
+// CHECK6-NEXT: br label [[DOTEXIT:%.*]]
+// CHECK6: .mastercheck:
+// CHECK6-NEXT: [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+// CHECK6-NEXT: [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK6-NEXT: [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+// CHECK6-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1
+// CHECK6-NEXT: [[TMP3:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1
+// CHECK6-NEXT: [[TMP4:%.*]] = xor i32 [[TMP2]], -1
+// CHECK6-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP3]], [[TMP4]]
+// CHECK6-NEXT: [[TMP5:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]]
+// CHECK6-NEXT: br i1 [[TMP5]], label [[DOTMASTER:%.*]], label [[DOTEXIT]]
+// CHECK6: .master:
+// CHECK6-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK6-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+// CHECK6-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]]
+// CHECK6-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1)
+// CHECK6-NEXT: call void @__kmpc_data_sharing_init_stack()
+// CHECK6-NEXT: [[TMP6:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK6-NEXT: [[TMP7:%.*]] = load double, double* [[TMP0]], align 8
+// CHECK6-NEXT: store double [[TMP7]], double* [[E7]], align 8
+// CHECK6-NEXT: store i32 [[TMP6]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK6-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], double* [[E7]]) #[[ATTR3]]
+// CHECK6-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]]
+// CHECK6: .termination.notifier:
+// CHECK6-NEXT: call void @__kmpc_kernel_deinit(i16 1)
+// CHECK6-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
+// CHECK6-NEXT: br label [[DOTEXIT]]
+// CHECK6: .exit:
+// CHECK6-NEXT: ret void
+//
+//
+// CHECK6-LABEL: define {{[^@]+}}@__omp_outlined__
+// CHECK6-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], double* nonnull align 8 dereferenceable(8) [[E:%.*]]) #[[ATTR1]] {
+// CHECK6-NEXT: entry:
+// CHECK6-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK6-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK6-NEXT: [[E_ADDR:%.*]] = alloca double*, align 4
+// CHECK6-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x i8*], align 4
+// CHECK6-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK6-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK6-NEXT: store double* [[E]], double** [[E_ADDR]], align 4
+// CHECK6-NEXT: [[TMP0:%.*]] = load double*, double** [[E_ADDR]], align 4
+// CHECK6-NEXT: [[TMP1:%.*]] = call i8* @__kmpc_data_sharing_push_stack(i32 8, i16 1)
+// CHECK6-NEXT: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to %struct._globalized_locals_ty*
+// CHECK6-NEXT: [[E1:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP2]], i32 0, i32 0
+// CHECK6-NEXT: store double 0.000000e+00, double* [[E1]], align 8
+// CHECK6-NEXT: [[TMP3:%.*]] = load double, double* [[E1]], align 8
+// CHECK6-NEXT: [[ADD:%.*]] = fadd double [[TMP3]], 5.000000e+00
+// CHECK6-NEXT: store double [[ADD]], double* [[E1]], align 8
+// CHECK6-NEXT: [[TMP4:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK6-NEXT: [[TMP5:%.*]] = load i32, i32* [[TMP4]], align 4
+// CHECK6-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0
+// CHECK6-NEXT: [[TMP7:%.*]] = bitcast double* [[E1]] to i8*
+// CHECK6-NEXT: store i8* [[TMP7]], i8** [[TMP6]], align 4
+// CHECK6-NEXT: [[TMP8:%.*]] = bitcast [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
+// CHECK6-NEXT: [[TMP9:%.*]] = load i8*, i8** @"_openmp_teams_reductions_buffer_$_$ptr", align 4
+// CHECK6-NEXT: [[TMP10:%.*]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i8* [[TMP9]], i32 2048, i8* [[TMP8]], void (i8*, i16, i16, i16)* @_omp_reduction_shuffle_and_reduce_func, void (i8*, i32)* @_omp_reduction_inter_warp_copy_func, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_copy_func, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_reduce_func, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_copy_func, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_reduce_func)
+// CHECK6-NEXT: [[TMP11:%.*]] = icmp eq i32 [[TMP10]], 1
+// CHECK6-NEXT: br i1 [[TMP11]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]]
+// CHECK6: .omp.reduction.then:
+// CHECK6-NEXT: [[TMP12:%.*]] = load double, double* [[TMP0]], align 8
+// CHECK6-NEXT: [[TMP13:%.*]] = load double, double* [[E1]], align 8
+// CHECK6-NEXT: [[ADD2:%.*]] = fadd double [[TMP12]], [[TMP13]]
+// CHECK6-NEXT: store double [[ADD2]], double* [[TMP0]], align 8
+// CHECK6-NEXT: call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP5]])
+// CHECK6-NEXT: br label [[DOTOMP_REDUCTION_DONE]]
+// CHECK6: .omp.reduction.done:
+// CHECK6-NEXT: call void @__kmpc_data_sharing_pop_stack(i8* [[TMP1]])
+// CHECK6-NEXT: ret void
+//
+//
+// CHECK6-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func
+// CHECK6-SAME: (i8* [[TMP0:%.*]], i16 signext [[TMP1:%.*]], i16 signext [[TMP2:%.*]], i16 signext [[TMP3:%.*]]) #[[ATTR0]] {
+// CHECK6-NEXT: entry:
+// CHECK6-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
+// CHECK6-NEXT: [[DOTADDR1:%.*]] = alloca i16, align 2
+// CHECK6-NEXT: [[DOTADDR2:%.*]] = alloca i16, align 2
+// CHECK6-NEXT: [[DOTADDR3:%.*]] = alloca i16, align 2
+// CHECK6-NEXT: [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST:%.*]] = alloca [1 x i8*], align 4
+// CHECK6-NEXT: [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca double, align 8
+// CHECK6-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
+// CHECK6-NEXT: store i16 [[TMP1]], i16* [[DOTADDR1]], align 2
+// CHECK6-NEXT: store i16 [[TMP2]], i16* [[DOTADDR2]], align 2
+// CHECK6-NEXT: store i16 [[TMP3]], i16* [[DOTADDR3]], align 2
+// CHECK6-NEXT: [[TMP4:%.*]] = load i8*, i8** [[DOTADDR]], align 4
+// CHECK6-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to [1 x i8*]*
+// CHECK6-NEXT: [[TMP6:%.*]] = load i16, i16* [[DOTADDR1]], align 2
+// CHECK6-NEXT: [[TMP7:%.*]] = load i16, i16* [[DOTADDR2]], align 2
+// CHECK6-NEXT: [[TMP8:%.*]] = load i16, i16* [[DOTADDR3]], align 2
+// CHECK6-NEXT: [[TMP9:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP5]], i32 0, i32 0
+// CHECK6-NEXT: [[TMP10:%.*]] = load i8*, i8** [[TMP9]], align 4
+// CHECK6-NEXT: [[TMP11:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0
+// CHECK6-NEXT: [[TMP12:%.*]] = bitcast i8* [[TMP10]] to double*
+// CHECK6-NEXT: [[TMP13:%.*]] = getelementptr double, double* [[TMP12]], i32 1
+// CHECK6-NEXT: [[TMP14:%.*]] = bitcast double* [[TMP13]] to i8*
+// CHECK6-NEXT: [[TMP15:%.*]] = bitcast double* [[TMP12]] to i64*
+// CHECK6-NEXT: [[TMP16:%.*]] = bitcast double* [[DOTOMP_REDUCTION_ELEMENT]] to i64*
+// CHECK6-NEXT: [[TMP17:%.*]] = load i64, i64* [[TMP15]], align 8
+// CHECK6-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+// CHECK6-NEXT: [[TMP18:%.*]] = trunc i32 [[NVPTX_WARP_SIZE]] to i16
+// CHECK6-NEXT: [[TMP19:%.*]] = call i64 @__kmpc_shuffle_int64(i64 [[TMP17]], i16 [[TMP7]], i16 [[TMP18]])
+// CHECK6-NEXT: store i64 [[TMP19]], i64* [[TMP16]], align 8
+// CHECK6-NEXT: [[TMP20:%.*]] = getelementptr i64, i64* [[TMP15]], i32 1
+// CHECK6-NEXT: [[TMP21:%.*]] = getelementptr i64, i64* [[TMP16]], i32 1
+// CHECK6-NEXT: [[TMP22:%.*]] = bitcast double* [[DOTOMP_REDUCTION_ELEMENT]] to i8*
+// CHECK6-NEXT: store i8* [[TMP22]], i8** [[TMP11]], align 4
+// CHECK6-NEXT: [[TMP23:%.*]] = icmp eq i16 [[TMP8]], 0
+// CHECK6-NEXT: [[TMP24:%.*]] = icmp eq i16 [[TMP8]], 1
+// CHECK6-NEXT: [[TMP25:%.*]] = icmp ult i16 [[TMP6]], [[TMP7]]
+// CHECK6-NEXT: [[TMP26:%.*]] = and i1 [[TMP24]], [[TMP25]]
+// CHECK6-NEXT: [[TMP27:%.*]] = icmp eq i16 [[TMP8]], 2
+// CHECK6-NEXT: [[TMP28:%.*]] = and i16 [[TMP6]], 1
+// CHECK6-NEXT: [[TMP29:%.*]] = icmp eq i16 [[TMP28]], 0
+// CHECK6-NEXT: [[TMP30:%.*]] = and i1 [[TMP27]], [[TMP29]]
+// CHECK6-NEXT: [[TMP31:%.*]] = icmp sgt i16 [[TMP7]], 0
+// CHECK6-NEXT: [[TMP32:%.*]] = and i1 [[TMP30]], [[TMP31]]
+// CHECK6-NEXT: [[TMP33:%.*]] = or i1 [[TMP23]], [[TMP26]]
+// CHECK6-NEXT: [[TMP34:%.*]] = or i1 [[TMP33]], [[TMP32]]
+// CHECK6-NEXT: br i1 [[TMP34]], label [[THEN:%.*]], label [[ELSE:%.*]]
+// CHECK6: then:
+// CHECK6-NEXT: [[TMP35:%.*]] = bitcast [1 x i8*]* [[TMP5]] to i8*
+// CHECK6-NEXT: [[TMP36:%.*]] = bitcast [1 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to i8*
+// CHECK6-NEXT: call void @"_omp$reduction$reduction_func"(i8* [[TMP35]], i8* [[TMP36]]) #[[ATTR3]]
+// CHECK6-NEXT: br label [[IFCONT:%.*]]
+// CHECK6: else:
+// CHECK6-NEXT: br label [[IFCONT]]
+// CHECK6: ifcont:
+// CHECK6-NEXT: [[TMP37:%.*]] = icmp eq i16 [[TMP8]], 1
+// CHECK6-NEXT: [[TMP38:%.*]] = icmp uge i16 [[TMP6]], [[TMP7]]
+// CHECK6-NEXT: [[TMP39:%.*]] = and i1 [[TMP37]], [[TMP38]]
+// CHECK6-NEXT: br i1 [[TMP39]], label [[THEN4:%.*]], label [[ELSE5:%.*]]
+// CHECK6: then4:
+// CHECK6-NEXT: [[TMP40:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0
+// CHECK6-NEXT: [[TMP41:%.*]] = load i8*, i8** [[TMP40]], align 4
+// CHECK6-NEXT: [[TMP42:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP5]], i32 0, i32 0
+// CHECK6-NEXT: [[TMP43:%.*]] = load i8*, i8** [[TMP42]], align 4
+// CHECK6-NEXT: [[TMP44:%.*]] = bitcast i8* [[TMP41]] to double*
+// CHECK6-NEXT: [[TMP45:%.*]] = bitcast i8* [[TMP43]] to double*
+// CHECK6-NEXT: [[TMP46:%.*]] = load double, double* [[TMP44]], align 8
+// CHECK6-NEXT: store double [[TMP46]], double* [[TMP45]], align 8
+// CHECK6-NEXT: br label [[IFCONT6:%.*]]
+// CHECK6: else5:
+// CHECK6-NEXT: br label [[IFCONT6]]
+// CHECK6: ifcont6:
+// CHECK6-NEXT: ret void
+//
+//
+// CHECK6-LABEL: define {{[^@]+}}@_omp_reduction_inter_warp_copy_func
+// CHECK6-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] {
+// CHECK6-NEXT: entry:
+// CHECK6-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
+// CHECK6-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
+// CHECK6-NEXT: [[DOTCNT_ADDR:%.*]] = alloca i32, align 4
+// CHECK6-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK6-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
+// CHECK6-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
+// CHECK6-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+// CHECK6-NEXT: [[NVPTX_TID2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+// CHECK6-NEXT: [[NVPTX_LANE_ID:%.*]] = and i32 [[NVPTX_TID2]], 31
+// CHECK6-NEXT: [[NVPTX_TID3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+// CHECK6-NEXT: [[NVPTX_WARP_ID:%.*]] = ashr i32 [[NVPTX_TID3]], 5
+// CHECK6-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4
+// CHECK6-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [1 x i8*]*
+// CHECK6-NEXT: store i32 0, i32* [[DOTCNT_ADDR]], align 4
+// CHECK6-NEXT: br label [[PRECOND:%.*]]
+// CHECK6: precond:
+// CHECK6-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTCNT_ADDR]], align 4
+// CHECK6-NEXT: [[TMP6:%.*]] = icmp ult i32 [[TMP5]], 2
+// CHECK6-NEXT: br i1 [[TMP6]], label [[BODY:%.*]], label [[EXIT:%.*]]
+// CHECK6: body:
+// CHECK6-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP2]])
+// CHECK6-NEXT: [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0
+// CHECK6-NEXT: br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]]
+// CHECK6: then:
+// CHECK6-NEXT: [[TMP7:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP4]], i32 0, i32 0
+// CHECK6-NEXT: [[TMP8:%.*]] = load i8*, i8** [[TMP7]], align 4
+// CHECK6-NEXT: [[TMP9:%.*]] = bitcast i8* [[TMP8]] to i32*
+// CHECK6-NEXT: [[TMP10:%.*]] = getelementptr i32, i32* [[TMP9]], i32 [[TMP5]]
+// CHECK6-NEXT: [[TMP11:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]]
+// CHECK6-NEXT: [[TMP12:%.*]] = load i32, i32* [[TMP10]], align 4
+// CHECK6-NEXT: store volatile i32 [[TMP12]], i32 addrspace(3)* [[TMP11]], align 4
+// CHECK6-NEXT: br label [[IFCONT:%.*]]
+// CHECK6: else:
+// CHECK6-NEXT: br label [[IFCONT]]
+// CHECK6: ifcont:
+// CHECK6-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]])
+// CHECK6-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTADDR1]], align 4
+// CHECK6-NEXT: [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP13]]
+// CHECK6-NEXT: br i1 [[IS_ACTIVE_THREAD]], label [[THEN4:%.*]], label [[ELSE5:%.*]]
+// CHECK6: then4:
+// CHECK6-NEXT: [[TMP14:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]]
+// CHECK6-NEXT: [[TMP15:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP4]], i32 0, i32 0
+// CHECK6-NEXT: [[TMP16:%.*]] = load i8*, i8** [[TMP15]], align 4
+// CHECK6-NEXT: [[TMP17:%.*]] = bitcast i8* [[TMP16]] to i32*
+// CHECK6-NEXT: [[TMP18:%.*]] = getelementptr i32, i32* [[TMP17]], i32 [[TMP5]]
+// CHECK6-NEXT: [[TMP19:%.*]] = load volatile i32, i32 addrspace(3)* [[TMP14]], align 4
+// CHECK6-NEXT: store i32 [[TMP19]], i32* [[TMP18]], align 4
+// CHECK6-NEXT: br label [[IFCONT6:%.*]]
+// CHECK6: else5:
+// CHECK6-NEXT: br label [[IFCONT6]]
+// CHECK6: ifcont6:
+// CHECK6-NEXT: [[TMP20:%.*]] = add nsw i32 [[TMP5]], 1
+// CHECK6-NEXT: store i32 [[TMP20]], i32* [[DOTCNT_ADDR]], align 4
+// CHECK6-NEXT: br label [[PRECOND]]
+// CHECK6: exit:
+// CHECK6-NEXT: ret void
+//
+//
+// CHECK6-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_copy_func
+// CHECK6-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] {
+// CHECK6-NEXT: entry:
+// CHECK6-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
+// CHECK6-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
+// CHECK6-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4
+// CHECK6-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
+// CHECK6-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
+// CHECK6-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4
+// CHECK6-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 4
+// CHECK6-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [1 x i8*]*
+// CHECK6-NEXT: [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 4
+// CHECK6-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.0*
+// CHECK6-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4
+// CHECK6-NEXT: [[TMP8:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP4]], i32 0, i32 0
+// CHECK6-NEXT: [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 4
+// CHECK6-NEXT: [[TMP10:%.*]] = bitcast i8* [[TMP9]] to double*
+// CHECK6-NEXT: [[E:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0:%.*]], %struct._globalized_locals_ty.0* [[TMP6]], i32 0, i32 0
+// CHECK6-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2048 x double], [2048 x double]* [[E]], i32 0, i32 [[TMP7]]
+// CHECK6-NEXT: [[TMP12:%.*]] = load double, double* [[TMP10]], align 8
+// CHECK6-NEXT: store double [[TMP12]], double* [[TMP11]], align 128
+// CHECK6-NEXT: ret void
+//
+//
+// CHECK6-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_reduce_func
+// CHECK6-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] {
+// CHECK6-NEXT: entry:
+// CHECK6-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
+// CHECK6-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
+// CHECK6-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4
+// CHECK6-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x i8*], align 4
+// CHECK6-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
+// CHECK6-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
+// CHECK6-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4
+// CHECK6-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4
+// CHECK6-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.0*
+// CHECK6-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4
+// CHECK6-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0
+// CHECK6-NEXT: [[E:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0:%.*]], %struct._globalized_locals_ty.0* [[TMP4]], i32 0, i32 0
+// CHECK6-NEXT: [[TMP7:%.*]] = getelementptr inbounds [2048 x double], [2048 x double]* [[E]], i32 0, i32 [[TMP5]]
+// CHECK6-NEXT: [[TMP8:%.*]] = bitcast double* [[TMP7]] to i8*
+// CHECK6-NEXT: store i8* [[TMP8]], i8** [[TMP6]], align 4
+// CHECK6-NEXT: [[TMP9:%.*]] = bitcast [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
+// CHECK6-NEXT: [[TMP10:%.*]] = load i8*, i8** [[DOTADDR2]], align 4
+// CHECK6-NEXT: call void @"_omp$reduction$reduction_func"(i8* [[TMP9]], i8* [[TMP10]]) #[[ATTR3]]
+// CHECK6-NEXT: ret void
+//
+//
+// CHECK6-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_copy_func
+// CHECK6-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] {
+// CHECK6-NEXT: entry:
+// CHECK6-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
+// CHECK6-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
+// CHECK6-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4
+// CHECK6-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
+// CHECK6-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
+// CHECK6-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4
+// CHECK6-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 4
+// CHECK6-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [1 x i8*]*
+// CHECK6-NEXT: [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 4
+// CHECK6-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.0*
+// CHECK6-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4
+// CHECK6-NEXT: [[TMP8:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP4]], i32 0, i32 0
+// CHECK6-NEXT: [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 4
+// CHECK6-NEXT: [[TMP10:%.*]] = bitcast i8* [[TMP9]] to double*
+// CHECK6-NEXT: [[E:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0:%.*]], %struct._globalized_locals_ty.0* [[TMP6]], i32 0, i32 0
+// CHECK6-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2048 x double], [2048 x double]* [[E]], i32 0, i32 [[TMP7]]
+// CHECK6-NEXT: [[TMP12:%.*]] = load double, double* [[TMP11]], align 128
+// CHECK6-NEXT: store double [[TMP12]], double* [[TMP10]], align 8
+// CHECK6-NEXT: ret void
+//
+//
+// CHECK6-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_reduce_func
+// CHECK6-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] {
+// CHECK6-NEXT: entry:
+// CHECK6-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
+// CHECK6-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
+// CHECK6-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4
+// CHECK6-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x i8*], align 4
+// CHECK6-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
+// CHECK6-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
+// CHECK6-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4
+// CHECK6-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4
+// CHECK6-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.0*
+// CHECK6-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4
+// CHECK6-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0
+// CHECK6-NEXT: [[E:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0:%.*]], %struct._globalized_locals_ty.0* [[TMP4]], i32 0, i32 0
+// CHECK6-NEXT: [[TMP7:%.*]] = getelementptr inbounds [2048 x double], [2048 x double]* [[E]], i32 0, i32 [[TMP5]]
+// CHECK6-NEXT: [[TMP8:%.*]] = bitcast double* [[TMP7]] to i8*
+// CHECK6-NEXT: store i8* [[TMP8]], i8** [[TMP6]], align 4
+// CHECK6-NEXT: [[TMP9:%.*]] = bitcast [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
+// CHECK6-NEXT: [[TMP10:%.*]] = load i8*, i8** [[DOTADDR2]], align 4
+// CHECK6-NEXT: call void @"_omp$reduction$reduction_func"(i8* [[TMP10]], i8* [[TMP9]]) #[[ATTR3]]
+// CHECK6-NEXT: ret void
+//
+//
+// CHECK6-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l29_worker
+// CHECK6-SAME: () #[[ATTR0]] {
+// CHECK6-NEXT: entry:
+// CHECK6-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 4
+// CHECK6-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1
+// CHECK6-NEXT: store i8* null, i8** [[WORK_FN]], align 4
+// CHECK6-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1
+// CHECK6-NEXT: br label [[DOTAWAIT_WORK:%.*]]
+// CHECK6: .await.work:
+// CHECK6-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
+// CHECK6-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]])
+// CHECK6-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8
+// CHECK6-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1
+// CHECK6-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 4
+// CHECK6-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null
+// CHECK6-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]]
+// CHECK6: .select.workers:
+// CHECK6-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1
+// CHECK6-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0
+// CHECK6-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]]
+// CHECK6: .execute.parallel:
+// CHECK6-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK6-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)*
+// CHECK6-NEXT: call void [[TMP5]](i16 0, i32 [[TMP4]])
+// CHECK6-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]]
+// CHECK6: .terminate.parallel:
+// CHECK6-NEXT: call void @__kmpc_kernel_end_parallel()
+// CHECK6-NEXT: br label [[DOTBARRIER_PARALLEL]]
+// CHECK6: .barrier.parallel:
+// CHECK6-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
+// CHECK6-NEXT: br label [[DOTAWAIT_WORK]]
+// CHECK6: .exit:
+// CHECK6-NEXT: ret void
+//
+//
+// CHECK6-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l29
+// CHECK6-SAME: (i32 [[C:%.*]], i32 [[D:%.*]]) #[[ATTR1]] {
+// CHECK6-NEXT: entry:
+// CHECK6-NEXT: [[C_ADDR:%.*]] = alloca i32, align 4
+// CHECK6-NEXT: [[D_ADDR:%.*]] = alloca i32, align 4
+// CHECK6-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK6-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK6-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK6-NEXT: store i32 [[C]], i32* [[C_ADDR]], align 4
+// CHECK6-NEXT: store i32 [[D]], i32* [[D_ADDR]], align 4
+// CHECK6-NEXT: [[CONV:%.*]] = bitcast i32* [[C_ADDR]] to i8*
+// CHECK6-NEXT: [[CONV1:%.*]] = bitcast i32* [[D_ADDR]] to float*
+// CHECK6-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+// CHECK6-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK6-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+// CHECK6-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]]
+// CHECK6-NEXT: [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]]
+// CHECK6-NEXT: br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]]
+// CHECK6: .worker:
+// CHECK6-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l29_worker() #[[ATTR3]]
+// CHECK6-NEXT: br label [[DOTEXIT:%.*]]
+// CHECK6: .mastercheck:
+// CHECK6-NEXT: [[NVPTX_TID2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+// CHECK6-NEXT: [[NVPTX_NUM_THREADS3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK6-NEXT: [[NVPTX_WARP_SIZE4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+// CHECK6-NEXT: [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE4]], 1
+// CHECK6-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS3]], 1
+// CHECK6-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], -1
+// CHECK6-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP2]], [[TMP3]]
+// CHECK6-NEXT: [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID2]], [[MASTER_TID]]
+// CHECK6-NEXT: br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]]
+// CHECK6: .master:
+// CHECK6-NEXT: [[NVPTX_NUM_THREADS5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK6-NEXT: [[NVPTX_WARP_SIZE6:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+// CHECK6-NEXT: [[THREAD_LIMIT7:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS5]], [[NVPTX_WARP_SIZE6]]
+// CHECK6-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT7]], i16 1)
+// CHECK6-NEXT: call void @__kmpc_data_sharing_init_stack()
+// CHECK6-NEXT: [[TMP5:%.*]] = call i8* @__kmpc_data_sharing_push_stack(i32 8, i16 1)
+// CHECK6-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.1*
+// CHECK6-NEXT: [[TMP7:%.*]] = load i8, i8* [[CONV]], align 4
+// CHECK6-NEXT: [[C8:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1:%.*]], %struct._globalized_locals_ty.1* [[TMP6]], i32 0, i32 1
+// CHECK6-NEXT: store i8 [[TMP7]], i8* [[C8]], align 4
+// CHECK6-NEXT: [[TMP8:%.*]] = load float, float* [[CONV1]], align 4
+// CHECK6-NEXT: [[D9:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1]], %struct._globalized_locals_ty.1* [[TMP6]], i32 0, i32 0
+// CHECK6-NEXT: store float [[TMP8]], float* [[D9]], align 4
+// CHECK6-NEXT: [[TMP9:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK6-NEXT: store i32 [[TMP9]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK6-NEXT: call void @__omp_outlined__1(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i8* [[C8]], float* [[D9]]) #[[ATTR3]]
+// CHECK6-NEXT: call void @__kmpc_data_sharing_pop_stack(i8* [[TMP5]])
+// CHECK6-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]]
+// CHECK6: .termination.notifier:
+// CHECK6-NEXT: call void @__kmpc_kernel_deinit(i16 1)
+// CHECK6-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
+// CHECK6-NEXT: br label [[DOTEXIT]]
+// CHECK6: .exit:
+// CHECK6-NEXT: ret void
+//
+//
+// CHECK6-LABEL: define {{[^@]+}}@__omp_outlined__1
+// CHECK6-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i8* nonnull align 1 dereferenceable(1) [[C:%.*]], float* nonnull align 4 dereferenceable(4) [[D:%.*]]) #[[ATTR1]] {
+// CHECK6-NEXT: entry:
+// CHECK6-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK6-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK6-NEXT: [[C_ADDR:%.*]] = alloca i8*, align 4
+// CHECK6-NEXT: [[D_ADDR:%.*]] = alloca float*, align 4
+// CHECK6-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 4
+// CHECK6-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK6-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK6-NEXT: store i8* [[C]], i8** [[C_ADDR]], align 4
+// CHECK6-NEXT: store float* [[D]], float** [[D_ADDR]], align 4
+// CHECK6-NEXT: [[TMP0:%.*]] = load i8*, i8** [[C_ADDR]], align 4
+// CHECK6-NEXT: [[TMP1:%.*]] = load float*, float** [[D_ADDR]], align 4
+// CHECK6-NEXT: [[TMP2:%.*]] = call i8* @__kmpc_data_sharing_push_stack(i32 8, i16 1)
+// CHECK6-NEXT: [[TMP3:%.*]] = bitcast i8* [[TMP2]] to %struct._globalized_locals_ty.2*
+// CHECK6-NEXT: [[C1:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_2:%.*]], %struct._globalized_locals_ty.2* [[TMP3]], i32 0, i32 1
+// CHECK6-NEXT: [[D2:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_2]], %struct._globalized_locals_ty.2* [[TMP3]], i32 0, i32 0
+// CHECK6-NEXT: store i8 0, i8* [[C1]], align 4
+// CHECK6-NEXT: store float 1.000000e+00, float* [[D2]], align 4
+// CHECK6-NEXT: [[TMP4:%.*]] = load i8, i8* [[C1]], align 4
+// CHECK6-NEXT: [[CONV:%.*]] = sext i8 [[TMP4]] to i32
+// CHECK6-NEXT: [[XOR:%.*]] = xor i32 [[CONV]], 2
+// CHECK6-NEXT: [[CONV3:%.*]] = trunc i32 [[XOR]] to i8
+// CHECK6-NEXT: store i8 [[CONV3]], i8* [[C1]], align 4
+// CHECK6-NEXT: [[TMP5:%.*]] = load float, float* [[D2]], align 4
+// CHECK6-NEXT: [[MUL:%.*]] = fmul float [[TMP5]], 3.300000e+01
+// CHECK6-NEXT: store float [[MUL]], float* [[D2]], align 4
+// CHECK6-NEXT: [[TMP6:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK6-NEXT: [[TMP7:%.*]] = load i32, i32* [[TMP6]], align 4
+// CHECK6-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0
+// CHECK6-NEXT: store i8* [[C1]], i8** [[TMP8]], align 4
+// CHECK6-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1
+// CHECK6-NEXT: [[TMP10:%.*]] = bitcast float* [[D2]] to i8*
+// CHECK6-NEXT: store i8* [[TMP10]], i8** [[TMP9]], align 4
+// CHECK6-NEXT: [[TMP11:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
+// CHECK6-NEXT: [[TMP12:%.*]] = load i8*, i8** @"_openmp_teams_reductions_buffer_$_$ptr", align 4
+// CHECK6-NEXT: [[TMP13:%.*]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(%struct.ident_t* @[[GLOB1]], i32 [[TMP7]], i8* [[TMP12]], i32 2048, i8* [[TMP11]], void (i8*, i16, i16, i16)* @_omp_reduction_shuffle_and_reduce_func3, void (i8*, i32)* @_omp_reduction_inter_warp_copy_func4, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_copy_func5, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_reduce_func6, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_copy_func7, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_reduce_func8)
+// CHECK6-NEXT: [[TMP14:%.*]] = icmp eq i32 [[TMP13]], 1
+// CHECK6-NEXT: br i1 [[TMP14]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]]
+// CHECK6: .omp.reduction.then:
+// CHECK6-NEXT: [[TMP15:%.*]] = load i8, i8* [[TMP0]], align 1
+// CHECK6-NEXT: [[CONV4:%.*]] = sext i8 [[TMP15]] to i32
+// CHECK6-NEXT: [[TMP16:%.*]] = load i8, i8* [[C1]], align 4
+// CHECK6-NEXT: [[CONV5:%.*]] = sext i8 [[TMP16]] to i32
+// CHECK6-NEXT: [[XOR6:%.*]] = xor i32 [[CONV4]], [[CONV5]]
+// CHECK6-NEXT: [[CONV7:%.*]] = trunc i32 [[XOR6]] to i8
+// CHECK6-NEXT: store i8 [[CONV7]], i8* [[TMP0]], align 1
+// CHECK6-NEXT: [[TMP17:%.*]] = load float, float* [[TMP1]], align 4
+// CHECK6-NEXT: [[TMP18:%.*]] = load float, float* [[D2]], align 4
+// CHECK6-NEXT: [[MUL8:%.*]] = fmul float [[TMP17]], [[TMP18]]
+// CHECK6-NEXT: store float [[MUL8]], float* [[TMP1]], align 4
+// CHECK6-NEXT: call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP7]])
+// CHECK6-NEXT: br label [[DOTOMP_REDUCTION_DONE]]
+// CHECK6: .omp.reduction.done:
+// CHECK6-NEXT: call void @__kmpc_data_sharing_pop_stack(i8* [[TMP2]])
+// CHECK6-NEXT: ret void
+//
+//
+// CHECK6-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func3
+// CHECK6-SAME: (i8* [[TMP0:%.*]], i16 signext [[TMP1:%.*]], i16 signext [[TMP2:%.*]], i16 signext [[TMP3:%.*]]) #[[ATTR0]] {
+// CHECK6-NEXT: entry:
+// CHECK6-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
+// CHECK6-NEXT: [[DOTADDR1:%.*]] = alloca i16, align 2
+// CHECK6-NEXT: [[DOTADDR2:%.*]] = alloca i16, align 2
+// CHECK6-NEXT: [[DOTADDR3:%.*]] = alloca i16, align 2
+// CHECK6-NEXT: [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST:%.*]] = alloca [2 x i8*], align 4
+// CHECK6-NEXT: [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca i8, align 1
+// CHECK6-NEXT: [[DOTOMP_REDUCTION_ELEMENT4:%.*]] = alloca float, align 4
+// CHECK6-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
+// CHECK6-NEXT: store i16 [[TMP1]], i16* [[DOTADDR1]], align 2
+// CHECK6-NEXT: store i16 [[TMP2]], i16* [[DOTADDR2]], align 2
+// CHECK6-NEXT: store i16 [[TMP3]], i16* [[DOTADDR3]], align 2
+// CHECK6-NEXT: [[TMP4:%.*]] = load i8*, i8** [[DOTADDR]], align 4
+// CHECK6-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to [2 x i8*]*
+// CHECK6-NEXT: [[TMP6:%.*]] = load i16, i16* [[DOTADDR1]], align 2
+// CHECK6-NEXT: [[TMP7:%.*]] = load i16, i16* [[DOTADDR2]], align 2
+// CHECK6-NEXT: [[TMP8:%.*]] = load i16, i16* [[DOTADDR3]], align 2
+// CHECK6-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 0
+// CHECK6-NEXT: [[TMP10:%.*]] = load i8*, i8** [[TMP9]], align 4
+// CHECK6-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0
+// CHECK6-NEXT: [[TMP12:%.*]] = getelementptr i8, i8* [[TMP10]], i32 1
+// CHECK6-NEXT: [[TMP13:%.*]] = load i8, i8* [[TMP10]], align 1
+// CHECK6-NEXT: [[TMP14:%.*]] = sext i8 [[TMP13]] to i32
+// CHECK6-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+// CHECK6-NEXT: [[TMP15:%.*]] = trunc i32 [[NVPTX_WARP_SIZE]] to i16
+// CHECK6-NEXT: [[TMP16:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP14]], i16 [[TMP7]], i16 [[TMP15]])
+// CHECK6-NEXT: [[TMP17:%.*]] = trunc i32 [[TMP16]] to i8
+// CHECK6-NEXT: store i8 [[TMP17]], i8* [[DOTOMP_REDUCTION_ELEMENT]], align 1
+// CHECK6-NEXT: [[TMP18:%.*]] = getelementptr i8, i8* [[TMP10]], i32 1
+// CHECK6-NEXT: [[TMP19:%.*]] = getelementptr i8, i8* [[DOTOMP_REDUCTION_ELEMENT]], i32 1
+// CHECK6-NEXT: store i8* [[DOTOMP_REDUCTION_ELEMENT]], i8** [[TMP11]], align 4
+// CHECK6-NEXT: [[TMP20:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 1
+// CHECK6-NEXT: [[TMP21:%.*]] = load i8*, i8** [[TMP20]], align 4
+// CHECK6-NEXT: [[TMP22:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1
+// CHECK6-NEXT: [[TMP23:%.*]] = bitcast i8* [[TMP21]] to float*
+// CHECK6-NEXT: [[TMP24:%.*]] = getelementptr float, float* [[TMP23]], i32 1
+// CHECK6-NEXT: [[TMP25:%.*]] = bitcast float* [[TMP24]] to i8*
+// CHECK6-NEXT: [[TMP26:%.*]] = bitcast float* [[TMP23]] to i32*
+// CHECK6-NEXT: [[TMP27:%.*]] = bitcast float* [[DOTOMP_REDUCTION_ELEMENT4]] to i32*
+// CHECK6-NEXT: [[TMP28:%.*]] = load i32, i32* [[TMP26]], align 4
+// CHECK6-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+// CHECK6-NEXT: [[TMP29:%.*]] = trunc i32 [[NVPTX_WARP_SIZE5]] to i16
+// CHECK6-NEXT: [[TMP30:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP28]], i16 [[TMP7]], i16 [[TMP29]])
+// CHECK6-NEXT: store i32 [[TMP30]], i32* [[TMP27]], align 4
+// CHECK6-NEXT: [[TMP31:%.*]] = getelementptr i32, i32* [[TMP26]], i32 1
+// CHECK6-NEXT: [[TMP32:%.*]] = getelementptr i32, i32* [[TMP27]], i32 1
+// CHECK6-NEXT: [[TMP33:%.*]] = bitcast float* [[DOTOMP_REDUCTION_ELEMENT4]] to i8*
+// CHECK6-NEXT: store i8* [[TMP33]], i8** [[TMP22]], align 4
+// CHECK6-NEXT: [[TMP34:%.*]] = icmp eq i16 [[TMP8]], 0
+// CHECK6-NEXT: [[TMP35:%.*]] = icmp eq i16 [[TMP8]], 1
+// CHECK6-NEXT: [[TMP36:%.*]] = icmp ult i16 [[TMP6]], [[TMP7]]
+// CHECK6-NEXT: [[TMP37:%.*]] = and i1 [[TMP35]], [[TMP36]]
+// CHECK6-NEXT: [[TMP38:%.*]] = icmp eq i16 [[TMP8]], 2
+// CHECK6-NEXT: [[TMP39:%.*]] = and i16 [[TMP6]], 1
+// CHECK6-NEXT: [[TMP40:%.*]] = icmp eq i16 [[TMP39]], 0
+// CHECK6-NEXT: [[TMP41:%.*]] = and i1 [[TMP38]], [[TMP40]]
+// CHECK6-NEXT: [[TMP42:%.*]] = icmp sgt i16 [[TMP7]], 0
+// CHECK6-NEXT: [[TMP43:%.*]] = and i1 [[TMP41]], [[TMP42]]
+// CHECK6-NEXT: [[TMP44:%.*]] = or i1 [[TMP34]], [[TMP37]]
+// CHECK6-NEXT: [[TMP45:%.*]] = or i1 [[TMP44]], [[TMP43]]
+// CHECK6-NEXT: br i1 [[TMP45]], label [[THEN:%.*]], label [[ELSE:%.*]]
+// CHECK6: then:
+// CHECK6-NEXT: [[TMP46:%.*]] = bitcast [2 x i8*]* [[TMP5]] to i8*
+// CHECK6-NEXT: [[TMP47:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to i8*
+// CHECK6-NEXT: call void @"_omp$reduction$reduction_func2"(i8* [[TMP46]], i8* [[TMP47]]) #[[ATTR3]]
+// CHECK6-NEXT: br label [[IFCONT:%.*]]
+// CHECK6: else:
+// CHECK6-NEXT: br label [[IFCONT]]
+// CHECK6: ifcont:
+// CHECK6-NEXT: [[TMP48:%.*]] = icmp eq i16 [[TMP8]], 1
+// CHECK6-NEXT: [[TMP49:%.*]] = icmp uge i16 [[TMP6]], [[TMP7]]
+// CHECK6-NEXT: [[TMP50:%.*]] = and i1 [[TMP48]], [[TMP49]]
+// CHECK6-NEXT: br i1 [[TMP50]], label [[THEN6:%.*]], label [[ELSE7:%.*]]
+// CHECK6: then6:
+// CHECK6-NEXT: [[TMP51:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0
+// CHECK6-NEXT: [[TMP52:%.*]] = load i8*, i8** [[TMP51]], align 4
+// CHECK6-NEXT: [[TMP53:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 0
+// CHECK6-NEXT: [[TMP54:%.*]] = load i8*, i8** [[TMP53]], align 4
+// CHECK6-NEXT: [[TMP55:%.*]] = load i8, i8* [[TMP52]], align 1
+// CHECK6-NEXT: store i8 [[TMP55]], i8* [[TMP54]], align 1
+// CHECK6-NEXT: [[TMP56:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1
+// CHECK6-NEXT: [[TMP57:%.*]] = load i8*, i8** [[TMP56]], align 4
+// CHECK6-NEXT: [[TMP58:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 1
+// CHECK6-NEXT: [[TMP59:%.*]] = load i8*, i8** [[TMP58]], align 4
+// CHECK6-NEXT: [[TMP60:%.*]] = bitcast i8* [[TMP57]] to float*
+// CHECK6-NEXT: [[TMP61:%.*]] = bitcast i8* [[TMP59]] to float*
+// CHECK6-NEXT: [[TMP62:%.*]] = load float, float* [[TMP60]], align 4
+// CHECK6-NEXT: store float [[TMP62]], float* [[TMP61]], align 4
+// CHECK6-NEXT: br label [[IFCONT8:%.*]]
+// CHECK6: else7:
+// CHECK6-NEXT: br label [[IFCONT8]]
+// CHECK6: ifcont8:
+// CHECK6-NEXT: ret void
+//
+//
+// CHECK6-LABEL: define {{[^@]+}}@_omp_reduction_inter_warp_copy_func4
+// CHECK6-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] {
+// CHECK6-NEXT: entry:
+// CHECK6-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
+// CHECK6-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
+// CHECK6-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK6-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
+// CHECK6-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
+// CHECK6-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+// CHECK6-NEXT: [[NVPTX_TID2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+// CHECK6-NEXT: [[NVPTX_LANE_ID:%.*]] = and i32 [[NVPTX_TID2]], 31
+// CHECK6-NEXT: [[NVPTX_TID3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+// CHECK6-NEXT: [[NVPTX_WARP_ID:%.*]] = ashr i32 [[NVPTX_TID3]], 5
+// CHECK6-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4
+// CHECK6-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]*
+// CHECK6-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]])
+// CHECK6-NEXT: [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0
+// CHECK6-NEXT: br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]]
+// CHECK6: then:
+// CHECK6-NEXT: [[TMP5:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0
+// CHECK6-NEXT: [[TMP6:%.*]] = load i8*, i8** [[TMP5]], align 4
+// CHECK6-NEXT: [[TMP7:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]]
+// CHECK6-NEXT: [[TMP8:%.*]] = bitcast i32 addrspace(3)* [[TMP7]] to i8 addrspace(3)*
+// CHECK6-NEXT: [[TMP9:%.*]] = load i8, i8* [[TMP6]], align 1
+// CHECK6-NEXT: store volatile i8 [[TMP9]], i8 addrspace(3)* [[TMP8]], align 1
+// CHECK6-NEXT: br label [[IFCONT:%.*]]
+// CHECK6: else:
+// CHECK6-NEXT: br label [[IFCONT]]
+// CHECK6: ifcont:
+// CHECK6-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]])
+// CHECK6-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTADDR1]], align 4
+// CHECK6-NEXT: [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP10]]
+// CHECK6-NEXT: br i1 [[IS_ACTIVE_THREAD]], label [[THEN4:%.*]], label [[ELSE5:%.*]]
+// CHECK6: then4:
+// CHECK6-NEXT: [[TMP11:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]]
+// CHECK6-NEXT: [[TMP12:%.*]] = bitcast i32 addrspace(3)* [[TMP11]] to i8 addrspace(3)*
+// CHECK6-NEXT: [[TMP13:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0
+// CHECK6-NEXT: [[TMP14:%.*]] = load i8*, i8** [[TMP13]], align 4
+// CHECK6-NEXT: [[TMP15:%.*]] = load volatile i8, i8 addrspace(3)* [[TMP12]], align 1
+// CHECK6-NEXT: store i8 [[TMP15]], i8* [[TMP14]], align 1
+// CHECK6-NEXT: br label [[IFCONT6:%.*]]
+// CHECK6: else5:
+// CHECK6-NEXT: br label [[IFCONT6]]
+// CHECK6: ifcont6:
+// CHECK6-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]])
+// CHECK6-NEXT: [[WARP_MASTER7:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0
+// CHECK6-NEXT: br i1 [[WARP_MASTER7]], label [[THEN8:%.*]], label [[ELSE9:%.*]]
+// CHECK6: then8:
+// CHECK6-NEXT: [[TMP16:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1
+// CHECK6-NEXT: [[TMP17:%.*]] = load i8*, i8** [[TMP16]], align 4
+// CHECK6-NEXT: [[TMP18:%.*]] = bitcast i8* [[TMP17]] to i32*
+// CHECK6-NEXT: [[TMP19:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]]
+// CHECK6-NEXT: [[TMP20:%.*]] = load i32, i32* [[TMP18]], align 4
+// CHECK6-NEXT: store volatile i32 [[TMP20]], i32 addrspace(3)* [[TMP19]], align 4
+// CHECK6-NEXT: br label [[IFCONT10:%.*]]
+// CHECK6: else9:
+// CHECK6-NEXT: br label [[IFCONT10]]
+// CHECK6: ifcont10:
+// CHECK6-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]])
+// CHECK6-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTADDR1]], align 4
+// CHECK6-NEXT: [[IS_ACTIVE_THREAD11:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP21]]
+// CHECK6-NEXT: br i1 [[IS_ACTIVE_THREAD11]], label [[THEN12:%.*]], label [[ELSE13:%.*]]
+// CHECK6: then12:
+// CHECK6-NEXT: [[TMP22:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]]
+// CHECK6-NEXT: [[TMP23:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1
+// CHECK6-NEXT: [[TMP24:%.*]] = load i8*, i8** [[TMP23]], align 4
+// CHECK6-NEXT: [[TMP25:%.*]] = bitcast i8* [[TMP24]] to i32*
+// CHECK6-NEXT: [[TMP26:%.*]] = load volatile i32, i32 addrspace(3)* [[TMP22]], align 4
+// CHECK6-NEXT: store i32 [[TMP26]], i32* [[TMP25]], align 4
+// CHECK6-NEXT: br label [[IFCONT14:%.*]]
+// CHECK6: else13:
+// CHECK6-NEXT: br label [[IFCONT14]]
+// CHECK6: ifcont14:
+// CHECK6-NEXT: ret void
+//
+//
+// CHECK6-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_copy_func5
+// CHECK6-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] {
+// CHECK6-NEXT: entry:
+// CHECK6-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
+// CHECK6-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
+// CHECK6-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4
+// CHECK6-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
+// CHECK6-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
+// CHECK6-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4
+// CHECK6-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 4
+// CHECK6-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]*
+// CHECK6-NEXT: [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 4
+// CHECK6-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.3*
+// CHECK6-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4
+// CHECK6-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0
+// CHECK6-NEXT: [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 4
+// CHECK6-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_3:%.*]], %struct._globalized_locals_ty.3* [[TMP6]], i32 0, i32 0
+// CHECK6-NEXT: [[TMP10:%.*]] = getelementptr inbounds [2048 x i8], [2048 x i8]* [[C]], i32 0, i32 [[TMP7]]
+// CHECK6-NEXT: [[TMP11:%.*]] = load i8, i8* [[TMP9]], align 1
+// CHECK6-NEXT: store i8 [[TMP11]], i8* [[TMP10]], align 128
+// CHECK6-NEXT: [[TMP12:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1
+// CHECK6-NEXT: [[TMP13:%.*]] = load i8*, i8** [[TMP12]], align 4
+// CHECK6-NEXT: [[TMP14:%.*]] = bitcast i8* [[TMP13]] to float*
+// CHECK6-NEXT: [[D:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_3]], %struct._globalized_locals_ty.3* [[TMP6]], i32 0, i32 1
+// CHECK6-NEXT: [[TMP15:%.*]] = getelementptr inbounds [2048 x float], [2048 x float]* [[D]], i32 0, i32 [[TMP7]]
+// CHECK6-NEXT: [[TMP16:%.*]] = load float, float* [[TMP14]], align 4
+// CHECK6-NEXT: store float [[TMP16]], float* [[TMP15]], align 128
+// CHECK6-NEXT: ret void
+//
+//
+// CHECK6-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_reduce_func6
+// CHECK6-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] {
+// CHECK6-NEXT: entry:
+// CHECK6-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
+// CHECK6-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
+// CHECK6-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4
+// CHECK6-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 4
+// CHECK6-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
+// CHECK6-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
+// CHECK6-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4
+// CHECK6-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4
+// CHECK6-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.3*
+// CHECK6-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4
+// CHECK6-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0
+// CHECK6-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_3:%.*]], %struct._globalized_locals_ty.3* [[TMP4]], i32 0, i32 0
+// CHECK6-NEXT: [[TMP7:%.*]] = getelementptr inbounds [2048 x i8], [2048 x i8]* [[C]], i32 0, i32 [[TMP5]]
+// CHECK6-NEXT: store i8* [[TMP7]], i8** [[TMP6]], align 4
+// CHECK6-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1
+// CHECK6-NEXT: [[D:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_3]], %struct._globalized_locals_ty.3* [[TMP4]], i32 0, i32 1
+// CHECK6-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2048 x float], [2048 x float]* [[D]], i32 0, i32 [[TMP5]]
+// CHECK6-NEXT: [[TMP10:%.*]] = bitcast float* [[TMP9]] to i8*
+// CHECK6-NEXT: store i8* [[TMP10]], i8** [[TMP8]], align 4
+// CHECK6-NEXT: [[TMP11:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
+// CHECK6-NEXT: [[TMP12:%.*]] = load i8*, i8** [[DOTADDR2]], align 4
+// CHECK6-NEXT: call void @"_omp$reduction$reduction_func2"(i8* [[TMP11]], i8* [[TMP12]]) #[[ATTR3]]
+// CHECK6-NEXT: ret void
+//
+//
+// CHECK6-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_copy_func7
+// CHECK6-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] {
+// CHECK6-NEXT: entry:
+// CHECK6-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
+// CHECK6-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
+// CHECK6-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4
+// CHECK6-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
+// CHECK6-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
+// CHECK6-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4
+// CHECK6-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 4
+// CHECK6-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]*
+// CHECK6-NEXT: [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 4
+// CHECK6-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.3*
+// CHECK6-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4
+// CHECK6-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0
+// CHECK6-NEXT: [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 4
+// CHECK6-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_3:%.*]], %struct._globalized_locals_ty.3* [[TMP6]], i32 0, i32 0
+// CHECK6-NEXT: [[TMP10:%.*]] = getelementptr inbounds [2048 x i8], [2048 x i8]* [[C]], i32 0, i32 [[TMP7]]
+// CHECK6-NEXT: [[TMP11:%.*]] = load i8, i8* [[TMP10]], align 128
+// CHECK6-NEXT: store i8 [[TMP11]], i8* [[TMP9]], align 1
+// CHECK6-NEXT: [[TMP12:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1
+// CHECK6-NEXT: [[TMP13:%.*]] = load i8*, i8** [[TMP12]], align 4
+// CHECK6-NEXT: [[TMP14:%.*]] = bitcast i8* [[TMP13]] to float*
+// CHECK6-NEXT: [[D:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_3]], %struct._globalized_locals_ty.3* [[TMP6]], i32 0, i32 1
+// CHECK6-NEXT: [[TMP15:%.*]] = getelementptr inbounds [2048 x float], [2048 x float]* [[D]], i32 0, i32 [[TMP7]]
+// CHECK6-NEXT: [[TMP16:%.*]] = load float, float* [[TMP15]], align 128
+// CHECK6-NEXT: store float [[TMP16]], float* [[TMP14]], align 4
+// CHECK6-NEXT: ret void
+//
+//
+// CHECK6-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_reduce_func8
+// CHECK6-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] {
+// CHECK6-NEXT: entry:
+// CHECK6-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
+// CHECK6-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
+// CHECK6-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4
+// CHECK6-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 4
+// CHECK6-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
+// CHECK6-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
+// CHECK6-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4
+// CHECK6-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4
+// CHECK6-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.3*
+// CHECK6-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4
+// CHECK6-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0
+// CHECK6-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_3:%.*]], %struct._globalized_locals_ty.3* [[TMP4]], i32 0, i32 0
+// CHECK6-NEXT: [[TMP7:%.*]] = getelementptr inbounds [2048 x i8], [2048 x i8]* [[C]], i32 0, i32 [[TMP5]]
+// CHECK6-NEXT: store i8* [[TMP7]], i8** [[TMP6]], align 4
+// CHECK6-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1
+// CHECK6-NEXT: [[D:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_3]], %struct._globalized_locals_ty.3* [[TMP4]], i32 0, i32 1
+// CHECK6-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2048 x float], [2048 x float]* [[D]], i32 0, i32 [[TMP5]]
+// CHECK6-NEXT: [[TMP10:%.*]] = bitcast float* [[TMP9]] to i8*
+// CHECK6-NEXT: store i8* [[TMP10]], i8** [[TMP8]], align 4
+// CHECK6-NEXT: [[TMP11:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
+// CHECK6-NEXT: [[TMP12:%.*]] = load i8*, i8** [[DOTADDR2]], align 4
+// CHECK6-NEXT: call void @"_omp$reduction$reduction_func2"(i8* [[TMP12]], i8* [[TMP11]]) #[[ATTR3]]
+// CHECK6-NEXT: ret void
+//
+//
+// CHECK6-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l36
+// CHECK6-SAME: (i32 [[A:%.*]], i32 [[B:%.*]]) #[[ATTR1]] {
+// CHECK6-NEXT: entry:
+// CHECK6-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4
+// CHECK6-NEXT: [[B_ADDR:%.*]] = alloca i32, align 4
+// CHECK6-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK6-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK6-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK6-NEXT: store i32 [[A]], i32* [[A_ADDR]], align 4
+// CHECK6-NEXT: store i32 [[B]], i32* [[B_ADDR]], align 4
+// CHECK6-NEXT: [[CONV:%.*]] = bitcast i32* [[B_ADDR]] to i16*
+// CHECK6-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK6-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 1)
+// CHECK6-NEXT: call void @__kmpc_data_sharing_init_stack_spmd()
+// CHECK6-NEXT: br label [[DOTEXECUTE:%.*]]
+// CHECK6: .execute:
+// CHECK6-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3:[0-9]+]])
+// CHECK6-NEXT: store i32 [[TMP0]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK6-NEXT: call void @__omp_outlined__9(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32* [[A_ADDR]], i16* [[CONV]]) #[[ATTR3]]
+// CHECK6-NEXT: br label [[DOTOMP_DEINIT:%.*]]
+// CHECK6: .omp.deinit:
+// CHECK6-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 1)
+// CHECK6-NEXT: br label [[DOTEXIT:%.*]]
+// CHECK6: .exit:
+// CHECK6-NEXT: ret void
+//
+//
+// CHECK6-LABEL: define {{[^@]+}}@__omp_outlined__9
+// CHECK6-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i16* nonnull align 2 dereferenceable(2) [[B:%.*]]) #[[ATTR1]] {
+// CHECK6-NEXT: entry:
+// CHECK6-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK6-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK6-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 4
+// CHECK6-NEXT: [[B_ADDR:%.*]] = alloca i16*, align 4
+// CHECK6-NEXT: [[A1:%.*]] = alloca i32, align 4
+// CHECK6-NEXT: [[B2:%.*]] = alloca i16, align 2
+// CHECK6-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [2 x i8*], align 4
+// CHECK6-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 4
+// CHECK6-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK6-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK6-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 4
+// CHECK6-NEXT: store i16* [[B]], i16** [[B_ADDR]], align 4
+// CHECK6-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 4
+// CHECK6-NEXT: [[TMP1:%.*]] = load i16*, i16** [[B_ADDR]], align 4
+// CHECK6-NEXT: store i32 0, i32* [[A1]], align 4
+// CHECK6-NEXT: store i16 -32768, i16* [[B2]], align 2
+// CHECK6-NEXT: [[TMP2:%.*]] = getelementptr inbounds [2 x i8*], [2 x
i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 +// CHECK6-NEXT: [[TMP3:%.*]] = bitcast i32* [[A1]] to i8* +// CHECK6-NEXT: store i8* [[TMP3]], i8** [[TMP2]], align 4 +// CHECK6-NEXT: [[TMP4:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 +// CHECK6-NEXT: [[TMP5:%.*]] = bitcast i16* [[B2]] to i8* +// CHECK6-NEXT: store i8* [[TMP5]], i8** [[TMP4]], align 4 +// CHECK6-NEXT: [[TMP6:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK6-NEXT: [[TMP7:%.*]] = load i32, i32* [[TMP6]], align 4 +// CHECK6-NEXT: [[TMP8:%.*]] = bitcast [2 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK6-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP7]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32*, i16*)* @__omp_outlined__10 to i8*), i8* null, i8** [[TMP8]], i32 2) +// CHECK6-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0 +// CHECK6-NEXT: [[TMP10:%.*]] = bitcast i32* [[A1]] to i8* +// CHECK6-NEXT: store i8* [[TMP10]], i8** [[TMP9]], align 4 +// CHECK6-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1 +// CHECK6-NEXT: [[TMP12:%.*]] = bitcast i16* [[B2]] to i8* +// CHECK6-NEXT: store i8* [[TMP12]], i8** [[TMP11]], align 4 +// CHECK6-NEXT: [[TMP13:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8* +// CHECK6-NEXT: [[TMP14:%.*]] = load i8*, i8** @"_openmp_teams_reductions_buffer_$_$ptr", align 4 +// CHECK6-NEXT: [[TMP15:%.*]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(%struct.ident_t* @[[GLOB3]], i32 [[TMP7]], i8* [[TMP14]], i32 2048, i8* [[TMP13]], void (i8*, i16, i16, i16)* @_omp_reduction_shuffle_and_reduce_func15, void (i8*, i32)* @_omp_reduction_inter_warp_copy_func16, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_copy_func17, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_reduce_func18, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_copy_func19, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_reduce_func20) +// CHECK6-NEXT: [[TMP16:%.*]] = icmp eq i32 [[TMP15]], 1 +// CHECK6-NEXT: br i1 [[TMP16]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]] +// CHECK6: .omp.reduction.then: +// CHECK6-NEXT: [[TMP17:%.*]] = load i32, i32* [[TMP0]], align 4 +// CHECK6-NEXT: [[TMP18:%.*]] = load i32, i32* [[A1]], align 4 +// CHECK6-NEXT: [[OR:%.*]] = or i32 [[TMP17]], [[TMP18]] +// CHECK6-NEXT: store i32 [[OR]], i32* [[TMP0]], align 4 +// CHECK6-NEXT: [[TMP19:%.*]] = load i16, i16* [[TMP1]], align 2 +// CHECK6-NEXT: [[CONV:%.*]] = sext i16 [[TMP19]] to i32 +// CHECK6-NEXT: [[TMP20:%.*]] = load i16, i16* [[B2]], align 2 +// CHECK6-NEXT: [[CONV3:%.*]] = sext i16 [[TMP20]] to i32 +// CHECK6-NEXT: [[CMP:%.*]] = icmp sgt i32 [[CONV]], [[CONV3]] +// CHECK6-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK6: cond.true: +// CHECK6-NEXT: [[TMP21:%.*]] = load i16, i16* [[TMP1]], align 2 +// CHECK6-NEXT: br label [[COND_END:%.*]] +// CHECK6: cond.false: +// CHECK6-NEXT: [[TMP22:%.*]] = load i16, i16* [[B2]], align 2 +// CHECK6-NEXT: br label [[COND_END]] +// CHECK6: cond.end: +// CHECK6-NEXT: [[COND:%.*]] = phi i16 [ [[TMP21]], [[COND_TRUE]] ], [ [[TMP22]], [[COND_FALSE]] ] +// CHECK6-NEXT: store i16 [[COND]], i16* [[TMP1]], align 2 +// CHECK6-NEXT: call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP7]]) +// CHECK6-NEXT: br label [[DOTOMP_REDUCTION_DONE]] +// CHECK6: .omp.reduction.done: +// CHECK6-NEXT: ret void +// +// +// 
CHECK6-LABEL: define {{[^@]+}}@__omp_outlined__10 +// CHECK6-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i16* nonnull align 2 dereferenceable(2) [[B:%.*]]) #[[ATTR1]] { +// CHECK6-NEXT: entry: +// CHECK6-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK6-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK6-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 4 +// CHECK6-NEXT: [[B_ADDR:%.*]] = alloca i16*, align 4 +// CHECK6-NEXT: [[A1:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[B2:%.*]] = alloca i16, align 2 +// CHECK6-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 4 +// CHECK6-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK6-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK6-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 4 +// CHECK6-NEXT: store i16* [[B]], i16** [[B_ADDR]], align 4 +// CHECK6-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 4 +// CHECK6-NEXT: [[TMP1:%.*]] = load i16*, i16** [[B_ADDR]], align 4 +// CHECK6-NEXT: store i32 0, i32* [[A1]], align 4 +// CHECK6-NEXT: store i16 -32768, i16* [[B2]], align 2 +// CHECK6-NEXT: [[TMP2:%.*]] = load i32, i32* [[A1]], align 4 +// CHECK6-NEXT: [[OR:%.*]] = or i32 [[TMP2]], 1 +// CHECK6-NEXT: store i32 [[OR]], i32* [[A1]], align 4 +// CHECK6-NEXT: [[TMP3:%.*]] = load i16, i16* [[B2]], align 2 +// CHECK6-NEXT: [[CONV:%.*]] = sext i16 [[TMP3]] to i32 +// CHECK6-NEXT: [[CMP:%.*]] = icmp sgt i32 99, [[CONV]] +// CHECK6-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK6: cond.true: +// CHECK6-NEXT: br label [[COND_END:%.*]] +// CHECK6: cond.false: +// CHECK6-NEXT: [[TMP4:%.*]] = load i16, i16* [[B2]], align 2 +// CHECK6-NEXT: [[CONV3:%.*]] = sext i16 [[TMP4]] to i32 +// CHECK6-NEXT: br label [[COND_END]] +// CHECK6: cond.end: +// CHECK6-NEXT: [[COND:%.*]] = phi i32 [ 99, [[COND_TRUE]] ], [ [[CONV3]], [[COND_FALSE]] ] +// CHECK6-NEXT: [[CONV4:%.*]] = trunc i32 [[COND]] to i16 +// CHECK6-NEXT: store i16 [[CONV4]], i16* [[B2]], align 2 +// CHECK6-NEXT: [[TMP5:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK6-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4 +// CHECK6-NEXT: [[TMP7:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0 +// CHECK6-NEXT: [[TMP8:%.*]] = bitcast i32* [[A1]] to i8* +// CHECK6-NEXT: store i8* [[TMP8]], i8** [[TMP7]], align 4 +// CHECK6-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1 +// CHECK6-NEXT: [[TMP10:%.*]] = bitcast i16* [[B2]] to i8* +// CHECK6-NEXT: store i8* [[TMP10]], i8** [[TMP9]], align 4 +// CHECK6-NEXT: [[TMP11:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8* +// CHECK6-NEXT: [[TMP12:%.*]] = call i32 @__kmpc_nvptx_parallel_reduce_nowait_v2(%struct.ident_t* @[[GLOB3]], i32 [[TMP6]], i32 2, i32 8, i8* [[TMP11]], void (i8*, i16, i16, i16)* @_omp_reduction_shuffle_and_reduce_func12, void (i8*, i32)* @_omp_reduction_inter_warp_copy_func13) +// CHECK6-NEXT: [[TMP13:%.*]] = icmp eq i32 [[TMP12]], 1 +// CHECK6-NEXT: br i1 [[TMP13]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]] +// CHECK6: .omp.reduction.then: +// CHECK6-NEXT: [[TMP14:%.*]] = load i32, i32* [[TMP0]], align 4 +// CHECK6-NEXT: [[TMP15:%.*]] = load i32, i32* [[A1]], align 4 +// CHECK6-NEXT: [[OR5:%.*]] = or i32 [[TMP14]], [[TMP15]] +// 
CHECK6-NEXT: store i32 [[OR5]], i32* [[TMP0]], align 4 +// CHECK6-NEXT: [[TMP16:%.*]] = load i16, i16* [[TMP1]], align 2 +// CHECK6-NEXT: [[CONV6:%.*]] = sext i16 [[TMP16]] to i32 +// CHECK6-NEXT: [[TMP17:%.*]] = load i16, i16* [[B2]], align 2 +// CHECK6-NEXT: [[CONV7:%.*]] = sext i16 [[TMP17]] to i32 +// CHECK6-NEXT: [[CMP8:%.*]] = icmp sgt i32 [[CONV6]], [[CONV7]] +// CHECK6-NEXT: br i1 [[CMP8]], label [[COND_TRUE9:%.*]], label [[COND_FALSE10:%.*]] +// CHECK6: cond.true9: +// CHECK6-NEXT: [[TMP18:%.*]] = load i16, i16* [[TMP1]], align 2 +// CHECK6-NEXT: br label [[COND_END11:%.*]] +// CHECK6: cond.false10: +// CHECK6-NEXT: [[TMP19:%.*]] = load i16, i16* [[B2]], align 2 +// CHECK6-NEXT: br label [[COND_END11]] +// CHECK6: cond.end11: +// CHECK6-NEXT: [[COND12:%.*]] = phi i16 [ [[TMP18]], [[COND_TRUE9]] ], [ [[TMP19]], [[COND_FALSE10]] ] +// CHECK6-NEXT: store i16 [[COND12]], i16* [[TMP1]], align 2 +// CHECK6-NEXT: call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP6]]) +// CHECK6-NEXT: br label [[DOTOMP_REDUCTION_DONE]] +// CHECK6: .omp.reduction.done: +// CHECK6-NEXT: ret void +// +// +// CHECK6-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func12 +// CHECK6-SAME: (i8* [[TMP0:%.*]], i16 signext [[TMP1:%.*]], i16 signext [[TMP2:%.*]], i16 signext [[TMP3:%.*]]) #[[ATTR0]] { +// CHECK6-NEXT: entry: +// CHECK6-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 +// CHECK6-NEXT: [[DOTADDR1:%.*]] = alloca i16, align 2 +// CHECK6-NEXT: [[DOTADDR2:%.*]] = alloca i16, align 2 +// CHECK6-NEXT: [[DOTADDR3:%.*]] = alloca i16, align 2 +// CHECK6-NEXT: [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST:%.*]] = alloca [2 x i8*], align 4 +// CHECK6-NEXT: [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTOMP_REDUCTION_ELEMENT4:%.*]] = alloca i16, align 2 +// CHECK6-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 +// CHECK6-NEXT: store i16 [[TMP1]], i16* [[DOTADDR1]], align 2 +// CHECK6-NEXT: store i16 [[TMP2]], i16* [[DOTADDR2]], align 2 +// CHECK6-NEXT: store i16 [[TMP3]], i16* [[DOTADDR3]], align 2 +// CHECK6-NEXT: [[TMP4:%.*]] = load i8*, i8** [[DOTADDR]], align 4 +// CHECK6-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to [2 x i8*]* +// CHECK6-NEXT: [[TMP6:%.*]] = load i16, i16* [[DOTADDR1]], align 2 +// CHECK6-NEXT: [[TMP7:%.*]] = load i16, i16* [[DOTADDR2]], align 2 +// CHECK6-NEXT: [[TMP8:%.*]] = load i16, i16* [[DOTADDR3]], align 2 +// CHECK6-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 0 +// CHECK6-NEXT: [[TMP10:%.*]] = load i8*, i8** [[TMP9]], align 4 +// CHECK6-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0 +// CHECK6-NEXT: [[TMP12:%.*]] = bitcast i8* [[TMP10]] to i32* +// CHECK6-NEXT: [[TMP13:%.*]] = getelementptr i32, i32* [[TMP12]], i32 1 +// CHECK6-NEXT: [[TMP14:%.*]] = bitcast i32* [[TMP13]] to i8* +// CHECK6-NEXT: [[TMP15:%.*]] = load i32, i32* [[TMP12]], align 4 +// CHECK6-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK6-NEXT: [[TMP16:%.*]] = trunc i32 [[NVPTX_WARP_SIZE]] to i16 +// CHECK6-NEXT: [[TMP17:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP15]], i16 [[TMP7]], i16 [[TMP16]]) +// CHECK6-NEXT: store i32 [[TMP17]], i32* [[DOTOMP_REDUCTION_ELEMENT]], align 4 +// CHECK6-NEXT: [[TMP18:%.*]] = getelementptr i32, i32* [[TMP12]], i32 1 +// CHECK6-NEXT: [[TMP19:%.*]] = getelementptr i32, i32* [[DOTOMP_REDUCTION_ELEMENT]], i32 1 +// CHECK6-NEXT: [[TMP20:%.*]] = bitcast i32* [[DOTOMP_REDUCTION_ELEMENT]] to i8* +// 
CHECK6-NEXT: store i8* [[TMP20]], i8** [[TMP11]], align 4 +// CHECK6-NEXT: [[TMP21:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 1 +// CHECK6-NEXT: [[TMP22:%.*]] = load i8*, i8** [[TMP21]], align 4 +// CHECK6-NEXT: [[TMP23:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1 +// CHECK6-NEXT: [[TMP24:%.*]] = bitcast i8* [[TMP22]] to i16* +// CHECK6-NEXT: [[TMP25:%.*]] = getelementptr i16, i16* [[TMP24]], i32 1 +// CHECK6-NEXT: [[TMP26:%.*]] = bitcast i16* [[TMP25]] to i8* +// CHECK6-NEXT: [[TMP27:%.*]] = load i16, i16* [[TMP24]], align 2 +// CHECK6-NEXT: [[TMP28:%.*]] = sext i16 [[TMP27]] to i32 +// CHECK6-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK6-NEXT: [[TMP29:%.*]] = trunc i32 [[NVPTX_WARP_SIZE5]] to i16 +// CHECK6-NEXT: [[TMP30:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP28]], i16 [[TMP7]], i16 [[TMP29]]) +// CHECK6-NEXT: [[TMP31:%.*]] = trunc i32 [[TMP30]] to i16 +// CHECK6-NEXT: store i16 [[TMP31]], i16* [[DOTOMP_REDUCTION_ELEMENT4]], align 2 +// CHECK6-NEXT: [[TMP32:%.*]] = getelementptr i16, i16* [[TMP24]], i32 1 +// CHECK6-NEXT: [[TMP33:%.*]] = getelementptr i16, i16* [[DOTOMP_REDUCTION_ELEMENT4]], i32 1 +// CHECK6-NEXT: [[TMP34:%.*]] = bitcast i16* [[DOTOMP_REDUCTION_ELEMENT4]] to i8* +// CHECK6-NEXT: store i8* [[TMP34]], i8** [[TMP23]], align 4 +// CHECK6-NEXT: [[TMP35:%.*]] = icmp eq i16 [[TMP8]], 0 +// CHECK6-NEXT: [[TMP36:%.*]] = icmp eq i16 [[TMP8]], 1 +// CHECK6-NEXT: [[TMP37:%.*]] = icmp ult i16 [[TMP6]], [[TMP7]] +// CHECK6-NEXT: [[TMP38:%.*]] = and i1 [[TMP36]], [[TMP37]] +// CHECK6-NEXT: [[TMP39:%.*]] = icmp eq i16 [[TMP8]], 2 +// CHECK6-NEXT: [[TMP40:%.*]] = and i16 [[TMP6]], 1 +// CHECK6-NEXT: [[TMP41:%.*]] = icmp eq i16 [[TMP40]], 0 +// CHECK6-NEXT: [[TMP42:%.*]] = and i1 [[TMP39]], [[TMP41]] +// CHECK6-NEXT: [[TMP43:%.*]] = icmp sgt i16 [[TMP7]], 0 +// CHECK6-NEXT: [[TMP44:%.*]] = and i1 [[TMP42]], [[TMP43]] +// CHECK6-NEXT: [[TMP45:%.*]] = or i1 [[TMP35]], [[TMP38]] +// CHECK6-NEXT: [[TMP46:%.*]] = or i1 [[TMP45]], [[TMP44]] +// CHECK6-NEXT: br i1 [[TMP46]], label [[THEN:%.*]], label [[ELSE:%.*]] +// CHECK6: then: +// CHECK6-NEXT: [[TMP47:%.*]] = bitcast [2 x i8*]* [[TMP5]] to i8* +// CHECK6-NEXT: [[TMP48:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to i8* +// CHECK6-NEXT: call void @"_omp$reduction$reduction_func11"(i8* [[TMP47]], i8* [[TMP48]]) #[[ATTR3]] +// CHECK6-NEXT: br label [[IFCONT:%.*]] +// CHECK6: else: +// CHECK6-NEXT: br label [[IFCONT]] +// CHECK6: ifcont: +// CHECK6-NEXT: [[TMP49:%.*]] = icmp eq i16 [[TMP8]], 1 +// CHECK6-NEXT: [[TMP50:%.*]] = icmp uge i16 [[TMP6]], [[TMP7]] +// CHECK6-NEXT: [[TMP51:%.*]] = and i1 [[TMP49]], [[TMP50]] +// CHECK6-NEXT: br i1 [[TMP51]], label [[THEN6:%.*]], label [[ELSE7:%.*]] +// CHECK6: then6: +// CHECK6-NEXT: [[TMP52:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0 +// CHECK6-NEXT: [[TMP53:%.*]] = load i8*, i8** [[TMP52]], align 4 +// CHECK6-NEXT: [[TMP54:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 0 +// CHECK6-NEXT: [[TMP55:%.*]] = load i8*, i8** [[TMP54]], align 4 +// CHECK6-NEXT: [[TMP56:%.*]] = bitcast i8* [[TMP53]] to i32* +// CHECK6-NEXT: [[TMP57:%.*]] = bitcast i8* [[TMP55]] to i32* +// CHECK6-NEXT: [[TMP58:%.*]] = load i32, i32* [[TMP56]], align 4 +// CHECK6-NEXT: store i32 [[TMP58]], i32* [[TMP57]], align 4 +// CHECK6-NEXT: [[TMP59:%.*]] = getelementptr inbounds [2 x 
i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1 +// CHECK6-NEXT: [[TMP60:%.*]] = load i8*, i8** [[TMP59]], align 4 +// CHECK6-NEXT: [[TMP61:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 1 +// CHECK6-NEXT: [[TMP62:%.*]] = load i8*, i8** [[TMP61]], align 4 +// CHECK6-NEXT: [[TMP63:%.*]] = bitcast i8* [[TMP60]] to i16* +// CHECK6-NEXT: [[TMP64:%.*]] = bitcast i8* [[TMP62]] to i16* +// CHECK6-NEXT: [[TMP65:%.*]] = load i16, i16* [[TMP63]], align 2 +// CHECK6-NEXT: store i16 [[TMP65]], i16* [[TMP64]], align 2 +// CHECK6-NEXT: br label [[IFCONT8:%.*]] +// CHECK6: else7: +// CHECK6-NEXT: br label [[IFCONT8]] +// CHECK6: ifcont8: +// CHECK6-NEXT: ret void +// +// +// CHECK6-LABEL: define {{[^@]+}}@_omp_reduction_inter_warp_copy_func13 +// CHECK6-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { +// CHECK6-NEXT: entry: +// CHECK6-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 +// CHECK6-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) +// CHECK6-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 +// CHECK6-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK6-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK6-NEXT: [[NVPTX_TID2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK6-NEXT: [[NVPTX_LANE_ID:%.*]] = and i32 [[NVPTX_TID2]], 31 +// CHECK6-NEXT: [[NVPTX_TID3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK6-NEXT: [[NVPTX_WARP_ID:%.*]] = ashr i32 [[NVPTX_TID3]], 5 +// CHECK6-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4 +// CHECK6-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]* +// CHECK6-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4:[0-9]+]], i32 [[TMP2]]) +// CHECK6-NEXT: [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0 +// CHECK6-NEXT: br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]] +// CHECK6: then: +// CHECK6-NEXT: [[TMP5:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0 +// CHECK6-NEXT: [[TMP6:%.*]] = load i8*, i8** [[TMP5]], align 4 +// CHECK6-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to i32* +// CHECK6-NEXT: [[TMP8:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]] +// CHECK6-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP7]], align 4 +// CHECK6-NEXT: store volatile i32 [[TMP9]], i32 addrspace(3)* [[TMP8]], align 4 +// CHECK6-NEXT: br label [[IFCONT:%.*]] +// CHECK6: else: +// CHECK6-NEXT: br label [[IFCONT]] +// CHECK6: ifcont: +// CHECK6-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]]) +// CHECK6-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTADDR1]], align 4 +// CHECK6-NEXT: [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP10]] +// CHECK6-NEXT: br i1 [[IS_ACTIVE_THREAD]], label [[THEN4:%.*]], label [[ELSE5:%.*]] +// CHECK6: then4: +// CHECK6-NEXT: [[TMP11:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]] +// CHECK6-NEXT: [[TMP12:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0 +// CHECK6-NEXT: [[TMP13:%.*]] = load i8*, i8** [[TMP12]], align 4 +// CHECK6-NEXT: [[TMP14:%.*]] = bitcast i8* [[TMP13]] to i32* +// CHECK6-NEXT: [[TMP15:%.*]] = load volatile i32, i32 addrspace(3)* [[TMP11]], align 4 +// CHECK6-NEXT: store i32 [[TMP15]], i32* [[TMP14]], align 4 +// CHECK6-NEXT: 
br label [[IFCONT6:%.*]] +// CHECK6: else5: +// CHECK6-NEXT: br label [[IFCONT6]] +// CHECK6: ifcont6: +// CHECK6-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]]) +// CHECK6-NEXT: [[WARP_MASTER7:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0 +// CHECK6-NEXT: br i1 [[WARP_MASTER7]], label [[THEN8:%.*]], label [[ELSE9:%.*]] +// CHECK6: then8: +// CHECK6-NEXT: [[TMP16:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1 +// CHECK6-NEXT: [[TMP17:%.*]] = load i8*, i8** [[TMP16]], align 4 +// CHECK6-NEXT: [[TMP18:%.*]] = bitcast i8* [[TMP17]] to i16* +// CHECK6-NEXT: [[TMP19:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]] +// CHECK6-NEXT: [[TMP20:%.*]] = bitcast i32 addrspace(3)* [[TMP19]] to i16 addrspace(3)* +// CHECK6-NEXT: [[TMP21:%.*]] = load i16, i16* [[TMP18]], align 2 +// CHECK6-NEXT: store volatile i16 [[TMP21]], i16 addrspace(3)* [[TMP20]], align 2 +// CHECK6-NEXT: br label [[IFCONT10:%.*]] +// CHECK6: else9: +// CHECK6-NEXT: br label [[IFCONT10]] +// CHECK6: ifcont10: +// CHECK6-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]]) +// CHECK6-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTADDR1]], align 4 +// CHECK6-NEXT: [[IS_ACTIVE_THREAD11:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP22]] +// CHECK6-NEXT: br i1 [[IS_ACTIVE_THREAD11]], label [[THEN12:%.*]], label [[ELSE13:%.*]] +// CHECK6: then12: +// CHECK6-NEXT: [[TMP23:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]] +// CHECK6-NEXT: [[TMP24:%.*]] = bitcast i32 addrspace(3)* [[TMP23]] to i16 addrspace(3)* +// CHECK6-NEXT: [[TMP25:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1 +// CHECK6-NEXT: [[TMP26:%.*]] = load i8*, i8** [[TMP25]], align 4 +// CHECK6-NEXT: [[TMP27:%.*]] = bitcast i8* [[TMP26]] to i16* +// CHECK6-NEXT: [[TMP28:%.*]] = load volatile i16, i16 addrspace(3)* [[TMP24]], align 2 +// CHECK6-NEXT: store i16 [[TMP28]], i16* [[TMP27]], align 2 +// CHECK6-NEXT: br label [[IFCONT14:%.*]] +// CHECK6: else13: +// CHECK6-NEXT: br label [[IFCONT14]] +// CHECK6: ifcont14: +// CHECK6-NEXT: ret void +// +// +// CHECK6-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func15 +// CHECK6-SAME: (i8* [[TMP0:%.*]], i16 signext [[TMP1:%.*]], i16 signext [[TMP2:%.*]], i16 signext [[TMP3:%.*]]) #[[ATTR0]] { +// CHECK6-NEXT: entry: +// CHECK6-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 +// CHECK6-NEXT: [[DOTADDR1:%.*]] = alloca i16, align 2 +// CHECK6-NEXT: [[DOTADDR2:%.*]] = alloca i16, align 2 +// CHECK6-NEXT: [[DOTADDR3:%.*]] = alloca i16, align 2 +// CHECK6-NEXT: [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST:%.*]] = alloca [2 x i8*], align 4 +// CHECK6-NEXT: [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTOMP_REDUCTION_ELEMENT4:%.*]] = alloca i16, align 2 +// CHECK6-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 +// CHECK6-NEXT: store i16 [[TMP1]], i16* [[DOTADDR1]], align 2 +// CHECK6-NEXT: store i16 [[TMP2]], i16* [[DOTADDR2]], align 2 +// CHECK6-NEXT: store i16 [[TMP3]], i16* [[DOTADDR3]], align 2 +// CHECK6-NEXT: [[TMP4:%.*]] = load i8*, i8** [[DOTADDR]], align 4 +// CHECK6-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to [2 x i8*]* +// CHECK6-NEXT: [[TMP6:%.*]] = load i16, i16* [[DOTADDR1]], align 2 +// CHECK6-NEXT: [[TMP7:%.*]] = load i16, i16* [[DOTADDR2]], align 2 +// CHECK6-NEXT: [[TMP8:%.*]] = load i16, i16* [[DOTADDR3]], align 
2 +// CHECK6-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 0 +// CHECK6-NEXT: [[TMP10:%.*]] = load i8*, i8** [[TMP9]], align 4 +// CHECK6-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0 +// CHECK6-NEXT: [[TMP12:%.*]] = bitcast i8* [[TMP10]] to i32* +// CHECK6-NEXT: [[TMP13:%.*]] = getelementptr i32, i32* [[TMP12]], i32 1 +// CHECK6-NEXT: [[TMP14:%.*]] = bitcast i32* [[TMP13]] to i8* +// CHECK6-NEXT: [[TMP15:%.*]] = load i32, i32* [[TMP12]], align 4 +// CHECK6-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK6-NEXT: [[TMP16:%.*]] = trunc i32 [[NVPTX_WARP_SIZE]] to i16 +// CHECK6-NEXT: [[TMP17:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP15]], i16 [[TMP7]], i16 [[TMP16]]) +// CHECK6-NEXT: store i32 [[TMP17]], i32* [[DOTOMP_REDUCTION_ELEMENT]], align 4 +// CHECK6-NEXT: [[TMP18:%.*]] = getelementptr i32, i32* [[TMP12]], i32 1 +// CHECK6-NEXT: [[TMP19:%.*]] = getelementptr i32, i32* [[DOTOMP_REDUCTION_ELEMENT]], i32 1 +// CHECK6-NEXT: [[TMP20:%.*]] = bitcast i32* [[DOTOMP_REDUCTION_ELEMENT]] to i8* +// CHECK6-NEXT: store i8* [[TMP20]], i8** [[TMP11]], align 4 +// CHECK6-NEXT: [[TMP21:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 1 +// CHECK6-NEXT: [[TMP22:%.*]] = load i8*, i8** [[TMP21]], align 4 +// CHECK6-NEXT: [[TMP23:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1 +// CHECK6-NEXT: [[TMP24:%.*]] = bitcast i8* [[TMP22]] to i16* +// CHECK6-NEXT: [[TMP25:%.*]] = getelementptr i16, i16* [[TMP24]], i32 1 +// CHECK6-NEXT: [[TMP26:%.*]] = bitcast i16* [[TMP25]] to i8* +// CHECK6-NEXT: [[TMP27:%.*]] = load i16, i16* [[TMP24]], align 2 +// CHECK6-NEXT: [[TMP28:%.*]] = sext i16 [[TMP27]] to i32 +// CHECK6-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK6-NEXT: [[TMP29:%.*]] = trunc i32 [[NVPTX_WARP_SIZE5]] to i16 +// CHECK6-NEXT: [[TMP30:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP28]], i16 [[TMP7]], i16 [[TMP29]]) +// CHECK6-NEXT: [[TMP31:%.*]] = trunc i32 [[TMP30]] to i16 +// CHECK6-NEXT: store i16 [[TMP31]], i16* [[DOTOMP_REDUCTION_ELEMENT4]], align 2 +// CHECK6-NEXT: [[TMP32:%.*]] = getelementptr i16, i16* [[TMP24]], i32 1 +// CHECK6-NEXT: [[TMP33:%.*]] = getelementptr i16, i16* [[DOTOMP_REDUCTION_ELEMENT4]], i32 1 +// CHECK6-NEXT: [[TMP34:%.*]] = bitcast i16* [[DOTOMP_REDUCTION_ELEMENT4]] to i8* +// CHECK6-NEXT: store i8* [[TMP34]], i8** [[TMP23]], align 4 +// CHECK6-NEXT: [[TMP35:%.*]] = icmp eq i16 [[TMP8]], 0 +// CHECK6-NEXT: [[TMP36:%.*]] = icmp eq i16 [[TMP8]], 1 +// CHECK6-NEXT: [[TMP37:%.*]] = icmp ult i16 [[TMP6]], [[TMP7]] +// CHECK6-NEXT: [[TMP38:%.*]] = and i1 [[TMP36]], [[TMP37]] +// CHECK6-NEXT: [[TMP39:%.*]] = icmp eq i16 [[TMP8]], 2 +// CHECK6-NEXT: [[TMP40:%.*]] = and i16 [[TMP6]], 1 +// CHECK6-NEXT: [[TMP41:%.*]] = icmp eq i16 [[TMP40]], 0 +// CHECK6-NEXT: [[TMP42:%.*]] = and i1 [[TMP39]], [[TMP41]] +// CHECK6-NEXT: [[TMP43:%.*]] = icmp sgt i16 [[TMP7]], 0 +// CHECK6-NEXT: [[TMP44:%.*]] = and i1 [[TMP42]], [[TMP43]] +// CHECK6-NEXT: [[TMP45:%.*]] = or i1 [[TMP35]], [[TMP38]] +// CHECK6-NEXT: [[TMP46:%.*]] = or i1 [[TMP45]], [[TMP44]] +// CHECK6-NEXT: br i1 [[TMP46]], label [[THEN:%.*]], label [[ELSE:%.*]] +// CHECK6: then: +// CHECK6-NEXT: [[TMP47:%.*]] = bitcast [2 x i8*]* [[TMP5]] to i8* +// CHECK6-NEXT: [[TMP48:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to i8* +// 
CHECK6-NEXT: call void @"_omp$reduction$reduction_func14"(i8* [[TMP47]], i8* [[TMP48]]) #[[ATTR3]] +// CHECK6-NEXT: br label [[IFCONT:%.*]] +// CHECK6: else: +// CHECK6-NEXT: br label [[IFCONT]] +// CHECK6: ifcont: +// CHECK6-NEXT: [[TMP49:%.*]] = icmp eq i16 [[TMP8]], 1 +// CHECK6-NEXT: [[TMP50:%.*]] = icmp uge i16 [[TMP6]], [[TMP7]] +// CHECK6-NEXT: [[TMP51:%.*]] = and i1 [[TMP49]], [[TMP50]] +// CHECK6-NEXT: br i1 [[TMP51]], label [[THEN6:%.*]], label [[ELSE7:%.*]] +// CHECK6: then6: +// CHECK6-NEXT: [[TMP52:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0 +// CHECK6-NEXT: [[TMP53:%.*]] = load i8*, i8** [[TMP52]], align 4 +// CHECK6-NEXT: [[TMP54:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 0 +// CHECK6-NEXT: [[TMP55:%.*]] = load i8*, i8** [[TMP54]], align 4 +// CHECK6-NEXT: [[TMP56:%.*]] = bitcast i8* [[TMP53]] to i32* +// CHECK6-NEXT: [[TMP57:%.*]] = bitcast i8* [[TMP55]] to i32* +// CHECK6-NEXT: [[TMP58:%.*]] = load i32, i32* [[TMP56]], align 4 +// CHECK6-NEXT: store i32 [[TMP58]], i32* [[TMP57]], align 4 +// CHECK6-NEXT: [[TMP59:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1 +// CHECK6-NEXT: [[TMP60:%.*]] = load i8*, i8** [[TMP59]], align 4 +// CHECK6-NEXT: [[TMP61:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 1 +// CHECK6-NEXT: [[TMP62:%.*]] = load i8*, i8** [[TMP61]], align 4 +// CHECK6-NEXT: [[TMP63:%.*]] = bitcast i8* [[TMP60]] to i16* +// CHECK6-NEXT: [[TMP64:%.*]] = bitcast i8* [[TMP62]] to i16* +// CHECK6-NEXT: [[TMP65:%.*]] = load i16, i16* [[TMP63]], align 2 +// CHECK6-NEXT: store i16 [[TMP65]], i16* [[TMP64]], align 2 +// CHECK6-NEXT: br label [[IFCONT8:%.*]] +// CHECK6: else7: +// CHECK6-NEXT: br label [[IFCONT8]] +// CHECK6: ifcont8: +// CHECK6-NEXT: ret void +// +// +// CHECK6-LABEL: define {{[^@]+}}@_omp_reduction_inter_warp_copy_func16 +// CHECK6-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { +// CHECK6-NEXT: entry: +// CHECK6-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 +// CHECK6-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) +// CHECK6-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 +// CHECK6-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK6-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK6-NEXT: [[NVPTX_TID2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK6-NEXT: [[NVPTX_LANE_ID:%.*]] = and i32 [[NVPTX_TID2]], 31 +// CHECK6-NEXT: [[NVPTX_TID3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK6-NEXT: [[NVPTX_WARP_ID:%.*]] = ashr i32 [[NVPTX_TID3]], 5 +// CHECK6-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4 +// CHECK6-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]* +// CHECK6-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]]) +// CHECK6-NEXT: [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0 +// CHECK6-NEXT: br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]] +// CHECK6: then: +// CHECK6-NEXT: [[TMP5:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0 +// CHECK6-NEXT: [[TMP6:%.*]] = load i8*, i8** [[TMP5]], align 4 +// CHECK6-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to i32* +// CHECK6-NEXT: [[TMP8:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]] +// 
CHECK6-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP7]], align 4 +// CHECK6-NEXT: store volatile i32 [[TMP9]], i32 addrspace(3)* [[TMP8]], align 4 +// CHECK6-NEXT: br label [[IFCONT:%.*]] +// CHECK6: else: +// CHECK6-NEXT: br label [[IFCONT]] +// CHECK6: ifcont: +// CHECK6-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]]) +// CHECK6-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTADDR1]], align 4 +// CHECK6-NEXT: [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP10]] +// CHECK6-NEXT: br i1 [[IS_ACTIVE_THREAD]], label [[THEN4:%.*]], label [[ELSE5:%.*]] +// CHECK6: then4: +// CHECK6-NEXT: [[TMP11:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]] +// CHECK6-NEXT: [[TMP12:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0 +// CHECK6-NEXT: [[TMP13:%.*]] = load i8*, i8** [[TMP12]], align 4 +// CHECK6-NEXT: [[TMP14:%.*]] = bitcast i8* [[TMP13]] to i32* +// CHECK6-NEXT: [[TMP15:%.*]] = load volatile i32, i32 addrspace(3)* [[TMP11]], align 4 +// CHECK6-NEXT: store i32 [[TMP15]], i32* [[TMP14]], align 4 +// CHECK6-NEXT: br label [[IFCONT6:%.*]] +// CHECK6: else5: +// CHECK6-NEXT: br label [[IFCONT6]] +// CHECK6: ifcont6: +// CHECK6-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]]) +// CHECK6-NEXT: [[WARP_MASTER7:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0 +// CHECK6-NEXT: br i1 [[WARP_MASTER7]], label [[THEN8:%.*]], label [[ELSE9:%.*]] +// CHECK6: then8: +// CHECK6-NEXT: [[TMP16:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1 +// CHECK6-NEXT: [[TMP17:%.*]] = load i8*, i8** [[TMP16]], align 4 +// CHECK6-NEXT: [[TMP18:%.*]] = bitcast i8* [[TMP17]] to i16* +// CHECK6-NEXT: [[TMP19:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]] +// CHECK6-NEXT: [[TMP20:%.*]] = bitcast i32 addrspace(3)* [[TMP19]] to i16 addrspace(3)* +// CHECK6-NEXT: [[TMP21:%.*]] = load i16, i16* [[TMP18]], align 2 +// CHECK6-NEXT: store volatile i16 [[TMP21]], i16 addrspace(3)* [[TMP20]], align 2 +// CHECK6-NEXT: br label [[IFCONT10:%.*]] +// CHECK6: else9: +// CHECK6-NEXT: br label [[IFCONT10]] +// CHECK6: ifcont10: +// CHECK6-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]]) +// CHECK6-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTADDR1]], align 4 +// CHECK6-NEXT: [[IS_ACTIVE_THREAD11:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP22]] +// CHECK6-NEXT: br i1 [[IS_ACTIVE_THREAD11]], label [[THEN12:%.*]], label [[ELSE13:%.*]] +// CHECK6: then12: +// CHECK6-NEXT: [[TMP23:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]] +// CHECK6-NEXT: [[TMP24:%.*]] = bitcast i32 addrspace(3)* [[TMP23]] to i16 addrspace(3)* +// CHECK6-NEXT: [[TMP25:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1 +// CHECK6-NEXT: [[TMP26:%.*]] = load i8*, i8** [[TMP25]], align 4 +// CHECK6-NEXT: [[TMP27:%.*]] = bitcast i8* [[TMP26]] to i16* +// CHECK6-NEXT: [[TMP28:%.*]] = load volatile i16, i16 addrspace(3)* [[TMP24]], align 2 +// CHECK6-NEXT: store i16 [[TMP28]], i16* [[TMP27]], align 2 +// CHECK6-NEXT: br label [[IFCONT14:%.*]] +// CHECK6: else13: +// CHECK6-NEXT: br label [[IFCONT14]] +// CHECK6: ifcont14: +// CHECK6-NEXT: ret void +// +// +// CHECK6-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_copy_func17 +// CHECK6-SAME: (i8* 
[[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { +// CHECK6-NEXT: entry: +// CHECK6-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 +// CHECK6-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4 +// CHECK6-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 +// CHECK6-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK6-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4 +// CHECK6-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 4 +// CHECK6-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]* +// CHECK6-NEXT: [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 4 +// CHECK6-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.4* +// CHECK6-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4 +// CHECK6-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0 +// CHECK6-NEXT: [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 4 +// CHECK6-NEXT: [[TMP10:%.*]] = bitcast i8* [[TMP9]] to i32* +// CHECK6-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4:%.*]], %struct._globalized_locals_ty.4* [[TMP6]], i32 0, i32 0 +// CHECK6-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* [[A]], i32 0, i32 [[TMP7]] +// CHECK6-NEXT: [[TMP12:%.*]] = load i32, i32* [[TMP10]], align 4 +// CHECK6-NEXT: store i32 [[TMP12]], i32* [[TMP11]], align 128 +// CHECK6-NEXT: [[TMP13:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1 +// CHECK6-NEXT: [[TMP14:%.*]] = load i8*, i8** [[TMP13]], align 4 +// CHECK6-NEXT: [[TMP15:%.*]] = bitcast i8* [[TMP14]] to i16* +// CHECK6-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4]], %struct._globalized_locals_ty.4* [[TMP6]], i32 0, i32 1 +// CHECK6-NEXT: [[TMP16:%.*]] = getelementptr inbounds [2048 x i16], [2048 x i16]* [[B]], i32 0, i32 [[TMP7]] +// CHECK6-NEXT: [[TMP17:%.*]] = load i16, i16* [[TMP15]], align 2 +// CHECK6-NEXT: store i16 [[TMP17]], i16* [[TMP16]], align 128 +// CHECK6-NEXT: ret void +// +// +// CHECK6-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_reduce_func18 +// CHECK6-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { +// CHECK6-NEXT: entry: +// CHECK6-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 +// CHECK6-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4 +// CHECK6-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 4 +// CHECK6-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 +// CHECK6-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK6-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4 +// CHECK6-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4 +// CHECK6-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.4* +// CHECK6-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4 +// CHECK6-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0 +// CHECK6-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4:%.*]], %struct._globalized_locals_ty.4* [[TMP4]], i32 0, i32 0 +// CHECK6-NEXT: [[TMP7:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* [[A]], i32 0, i32 [[TMP5]] +// CHECK6-NEXT: [[TMP8:%.*]] = bitcast i32* [[TMP7]] to i8* +// CHECK6-NEXT: store i8* [[TMP8]], i8** [[TMP6]], align 4 +// CHECK6-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1 +// 
CHECK6-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4]], %struct._globalized_locals_ty.4* [[TMP4]], i32 0, i32 1 +// CHECK6-NEXT: [[TMP10:%.*]] = getelementptr inbounds [2048 x i16], [2048 x i16]* [[B]], i32 0, i32 [[TMP5]] +// CHECK6-NEXT: [[TMP11:%.*]] = bitcast i16* [[TMP10]] to i8* +// CHECK6-NEXT: store i8* [[TMP11]], i8** [[TMP9]], align 4 +// CHECK6-NEXT: [[TMP12:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8* +// CHECK6-NEXT: [[TMP13:%.*]] = load i8*, i8** [[DOTADDR2]], align 4 +// CHECK6-NEXT: call void @"_omp$reduction$reduction_func14"(i8* [[TMP12]], i8* [[TMP13]]) #[[ATTR3]] +// CHECK6-NEXT: ret void +// +// +// CHECK6-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_copy_func19 +// CHECK6-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { +// CHECK6-NEXT: entry: +// CHECK6-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 +// CHECK6-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4 +// CHECK6-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 +// CHECK6-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK6-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4 +// CHECK6-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 4 +// CHECK6-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]* +// CHECK6-NEXT: [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 4 +// CHECK6-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.4* +// CHECK6-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4 +// CHECK6-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0 +// CHECK6-NEXT: [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 4 +// CHECK6-NEXT: [[TMP10:%.*]] = bitcast i8* [[TMP9]] to i32* +// CHECK6-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4:%.*]], %struct._globalized_locals_ty.4* [[TMP6]], i32 0, i32 0 +// CHECK6-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* [[A]], i32 0, i32 [[TMP7]] +// CHECK6-NEXT: [[TMP12:%.*]] = load i32, i32* [[TMP11]], align 128 +// CHECK6-NEXT: store i32 [[TMP12]], i32* [[TMP10]], align 4 +// CHECK6-NEXT: [[TMP13:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1 +// CHECK6-NEXT: [[TMP14:%.*]] = load i8*, i8** [[TMP13]], align 4 +// CHECK6-NEXT: [[TMP15:%.*]] = bitcast i8* [[TMP14]] to i16* +// CHECK6-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4]], %struct._globalized_locals_ty.4* [[TMP6]], i32 0, i32 1 +// CHECK6-NEXT: [[TMP16:%.*]] = getelementptr inbounds [2048 x i16], [2048 x i16]* [[B]], i32 0, i32 [[TMP7]] +// CHECK6-NEXT: [[TMP17:%.*]] = load i16, i16* [[TMP16]], align 128 +// CHECK6-NEXT: store i16 [[TMP17]], i16* [[TMP15]], align 2 +// CHECK6-NEXT: ret void +// +// +// CHECK6-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_reduce_func20 +// CHECK6-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { +// CHECK6-NEXT: entry: +// CHECK6-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 +// CHECK6-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4 +// CHECK6-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 4 +// CHECK6-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 +// CHECK6-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK6-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4 +// CHECK6-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4 +// CHECK6-NEXT: 
[[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.4* +// CHECK6-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4 +// CHECK6-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0 +// CHECK6-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4:%.*]], %struct._globalized_locals_ty.4* [[TMP4]], i32 0, i32 0 +// CHECK6-NEXT: [[TMP7:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* [[A]], i32 0, i32 [[TMP5]] +// CHECK6-NEXT: [[TMP8:%.*]] = bitcast i32* [[TMP7]] to i8* +// CHECK6-NEXT: store i8* [[TMP8]], i8** [[TMP6]], align 4 +// CHECK6-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1 +// CHECK6-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4]], %struct._globalized_locals_ty.4* [[TMP4]], i32 0, i32 1 +// CHECK6-NEXT: [[TMP10:%.*]] = getelementptr inbounds [2048 x i16], [2048 x i16]* [[B]], i32 0, i32 [[TMP5]] +// CHECK6-NEXT: [[TMP11:%.*]] = bitcast i16* [[TMP10]] to i8* +// CHECK6-NEXT: store i8* [[TMP11]], i8** [[TMP9]], align 4 +// CHECK6-NEXT: [[TMP12:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8* +// CHECK6-NEXT: [[TMP13:%.*]] = load i8*, i8** [[DOTADDR2]], align 4 +// CHECK6-NEXT: call void @"_omp$reduction$reduction_func14"(i8* [[TMP13]], i8* [[TMP12]]) #[[ATTR3]] +// CHECK6-NEXT: ret void +// diff --git a/clang/test/OpenMP/remarks_parallel_in_multiple_target_state_machines.c b/clang/test/OpenMP/remarks_parallel_in_multiple_target_state_machines.c --- a/clang/test/OpenMP/remarks_parallel_in_multiple_target_state_machines.c +++ b/clang/test/OpenMP/remarks_parallel_in_multiple_target_state_machines.c @@ -6,7 +6,7 @@ void bar1(void) { // all-remark {{[OMP100] Potentially unknown OpenMP target region caller}} #pragma omp parallel // #0 - // all-remark@#0 {{Found a parallel region that is called in a target region but not part of a combined target construct nor nesed inside a target construct without intermediate code. This can lead to excessive register usage for unrelated target regions in the same translation unit due to spurious call edges assumed by ptxas.}} + // all-remark@#0 {{Found a parallel region that is called in a target region but not part of a combined target construct nor nested inside a target construct without intermediate code. This can lead to excessive register usage for unrelated target regions in the same translation unit due to spurious call edges assumed by ptxas.}} // safe-remark@#0 {{Parallel region is not known to be called from a unique single target region, maybe the surrounding function has external linkage?; will not attempt to rewrite the state machine use.}} // force-remark@#0 {{[UNSAFE] Parallel region is not known to be called from a unique single target region, maybe the surrounding function has external linkage?; will rewrite the state machine use due to command line flag, this can lead to undefined behavior if the parallel region is called from a target region outside this translation unit.}} // force-remark@#0 {{Specialize parallel region that is only reached from a single target region to avoid spurious call edges and excessive register usage in other target regions. 
(parallel region ID: __omp_outlined__2_wrapper, kernel ID: }} @@ -15,7 +15,7 @@ } void bar2(void) { // all-remark {{[OMP100] Potentially unknown OpenMP target region caller}} #pragma omp parallel // #1 - // all-remark@#1 {{Found a parallel region that is called in a target region but not part of a combined target construct nor nesed inside a target construct without intermediate code. This can lead to excessive register usage for unrelated target regions in the same translation unit due to spurious call edges assumed by ptxas.}} + // all-remark@#1 {{Found a parallel region that is called in a target region but not part of a combined target construct nor nested inside a target construct without intermediate code. This can lead to excessive register usage for unrelated target regions in the same translation unit due to spurious call edges assumed by ptxas.}} // safe-remark@#1 {{Parallel region is not known to be called from a unique single target region, maybe the surrounding function has external linkage?; will not attempt to rewrite the state machine use.}} // force-remark@#1 {{[UNSAFE] Parallel region is not known to be called from a unique single target region, maybe the surrounding function has external linkage?; will rewrite the state machine use due to command line flag, this can lead to undefined behavior if the parallel region is called from a target region outside this translation unit.}} // force-remark@#1 {{Specialize parallel region that is only reached from a single target region to avoid spurious call edges and excessive register usage in other target regions. (parallel region ID: __omp_outlined__6_wrapper, kernel ID: }} @@ -29,13 +29,13 @@ // all-remark@#2 {{Target region containing the parallel region that is specialized. (parallel region ID: __omp_outlined__3_wrapper, kernel ID: __omp_offloading}} { #pragma omp parallel // #3 - // all-remark@#3 {{Found a parallel region that is called in a target region but not part of a combined target construct nor nesed inside a target construct without intermediate code. This can lead to excessive register usage for unrelated target regions in the same translation unit due to spurious call edges assumed by ptxas.}} + // all-remark@#3 {{Found a parallel region that is called in a target region but not part of a combined target construct nor nested inside a target construct without intermediate code. This can lead to excessive register usage for unrelated target regions in the same translation unit due to spurious call edges assumed by ptxas.}} // all-remark@#3 {{Specialize parallel region that is only reached from a single target region to avoid spurious call edges and excessive register usage in other target regions. (parallel region ID: __omp_outlined__1_wrapper, kernel ID: __omp_offloading}} { } bar1(); #pragma omp parallel // #4 - // all-remark@#4 {{Found a parallel region that is called in a target region but not part of a combined target construct nor nesed inside a target construct without intermediate code. This can lead to excessive register usage for unrelated target regions in the same translation unit due to spurious call edges assumed by ptxas.}} + // all-remark@#4 {{Found a parallel region that is called in a target region but not part of a combined target construct nor nested inside a target construct without intermediate code. 
This can lead to excessive register usage for unrelated target regions in the same translation unit due to spurious call edges assumed by ptxas.}} // all-remark@#4 {{Specialize parallel region that is only reached from a single target region to avoid spurious call edges and excessive register usage in other target regions. (parallel region ID: __omp_outlined__3_wrapper, kernel ID: __omp_offloading}} { } @@ -48,14 +48,14 @@ // all-remark@#5 {{Target region containing the parallel region that is specialized. (parallel region ID: __omp_outlined__7_wrapper, kernel ID: __omp_offloading}} { #pragma omp parallel // #6 - // all-remark@#6 {{Found a parallel region that is called in a target region but not part of a combined target construct nor nesed inside a target construct without intermediate code. This can lead to excessive register usage for unrelated target regions in the same translation unit due to spurious call edges assumed by ptxas.}} + // all-remark@#6 {{Found a parallel region that is called in a target region but not part of a combined target construct nor nested inside a target construct without intermediate code. This can lead to excessive register usage for unrelated target regions in the same translation unit due to spurious call edges assumed by ptxas.}} // all-remark@#6 {{Specialize parallel region that is only reached from a single target region to avoid spurious call edges and excessive register usage in other target regions. (parallel region ID: __omp_outlined__5_wrapper, kernel ID: __omp_offloading}} { } bar1(); bar2(); #pragma omp parallel // #7 - // all-remark@#7 {{Found a parallel region that is called in a target region but not part of a combined target construct nor nesed inside a target construct without intermediate code. This can lead to excessive register usage for unrelated target regions in the same translation unit due to spurious call edges assumed by ptxas.}} + // all-remark@#7 {{Found a parallel region that is called in a target region but not part of a combined target construct nor nested inside a target construct without intermediate code. This can lead to excessive register usage for unrelated target regions in the same translation unit due to spurious call edges assumed by ptxas.}} // all-remark@#7 {{Specialize parallel region that is only reached from a single target region to avoid spurious call edges and excessive register usage in other target regions. (parallel region ID: __omp_outlined__7_wrapper, kernel ID: __omp_offloading}} { } @@ -70,14 +70,14 @@ // all-remark@#8 {{Target region containing the parallel region that is specialized. (parallel region ID: __omp_outlined__10_wrapper, kernel ID: __omp_offloading}} { #pragma omp parallel // #9 - // all-remark@#9 {{Found a parallel region that is called in a target region but not part of a combined target construct nor nesed inside a target construct without intermediate code. This can lead to excessive register usage for unrelated target regions in the same translation unit due to spurious call edges assumed by ptxas.}} + // all-remark@#9 {{Found a parallel region that is called in a target region but not part of a combined target construct nor nested inside a target construct without intermediate code. 
This can lead to excessive register usage for unrelated target regions in the same translation unit due to spurious call edges assumed by ptxas.}} // all-remark@#9 {{Specialize parallel region that is only reached from a single target region to avoid spurious call edges and excessive register usage in other target regions. (parallel region ID: __omp_outlined__9_wrapper, kernel ID: __omp_offloading}} { } bar1(); bar2(); #pragma omp parallel // #10 - // all-remark@#10 {{Found a parallel region that is called in a target region but not part of a combined target construct nor nesed inside a target construct without intermediate code. This can lead to excessive register usage for unrelated target regions in the same translation unit due to spurious call edges assumed by ptxas.}} + // all-remark@#10 {{Found a parallel region that is called in a target region but not part of a combined target construct nor nested inside a target construct without intermediate code. This can lead to excessive register usage for unrelated target regions in the same translation unit due to spurious call edges assumed by ptxas.}} // all-remark@#10 {{Specialize parallel region that is only reached from a single target region to avoid spurious call edges and excessive register usage in other target regions. (parallel region ID: __omp_outlined__10_wrapper, kernel ID: __omp_offloading}} { } @@ -98,5 +98,5 @@ } } -// all-remark@* 3 {{OpenMP runtime call __kmpc_global_thread_num moved to}} -// all-remark@* 3 {{OpenMP runtime call __kmpc_global_thread_num deduplicated}} +// all-remark@* 5 {{OpenMP runtime call __kmpc_global_thread_num moved to}} +// all-remark@* 12 {{OpenMP runtime call __kmpc_global_thread_num deduplicated}} diff --git a/clang/test/OpenMP/remarks_parallel_in_target_state_machine.c b/clang/test/OpenMP/remarks_parallel_in_target_state_machine.c --- a/clang/test/OpenMP/remarks_parallel_in_target_state_machine.c +++ b/clang/test/OpenMP/remarks_parallel_in_target_state_machine.c @@ -6,7 +6,7 @@ void bar(void) { // expected-remark {{[OMP100] Potentially unknown OpenMP target region caller}} #pragma omp parallel // #1 \ - // expected-remark@#1 {{Found a parallel region that is called in a target region but not part of a combined target construct nor nesed inside a target construct without intermediate code. This can lead to excessive register usage for unrelated target regions in the same translation unit due to spurious call edges assumed by ptxas.}} \ + // expected-remark@#1 {{Found a parallel region that is called in a target region but not part of a combined target construct nor nested inside a target construct without intermediate code. This can lead to excessive register usage for unrelated target regions in the same translation unit due to spurious call edges assumed by ptxas.}} \ // expected-remark@#1 {{Parallel region is not known to be called from a unique single target region, maybe the surrounding function has external linkage?; will not attempt to rewrite the state machine use.}} { } @@ -18,13 +18,13 @@ // expected-remark@#2 {{Target region containing the parallel region that is specialized. (parallel region ID: __omp_outlined__3_wrapper, kernel ID: __omp_offloading}} { #pragma omp parallel // #3 \ - // expected-remark@#3 {{Found a parallel region that is called in a target region but not part of a combined target construct nor nesed inside a target construct without intermediate code. 
diff --git a/clang/test/OpenMP/remarks_parallel_in_target_state_machine.c b/clang/test/OpenMP/remarks_parallel_in_target_state_machine.c
--- a/clang/test/OpenMP/remarks_parallel_in_target_state_machine.c
+++ b/clang/test/OpenMP/remarks_parallel_in_target_state_machine.c
@@ -6,7 +6,7 @@
void bar(void) { // expected-remark {{[OMP100] Potentially unknown OpenMP target region caller}}
#pragma omp parallel // #1 \
- // expected-remark@#1 {{Found a parallel region that is called in a target region but not part of a combined target construct nor nesed inside a target construct without intermediate code. This can lead to excessive register usage for unrelated target regions in the same translation unit due to spurious call edges assumed by ptxas.}} \
+ // expected-remark@#1 {{Found a parallel region that is called in a target region but not part of a combined target construct nor nested inside a target construct without intermediate code. This can lead to excessive register usage for unrelated target regions in the same translation unit due to spurious call edges assumed by ptxas.}} \
// expected-remark@#1 {{Parallel region is not known to be called from a unique single target region, maybe the surrounding function has external linkage?; will not attempt to rewrite the state machine use.}}
{
}
@@ -18,13 +18,13 @@
// expected-remark@#2 {{Target region containing the parallel region that is specialized. (parallel region ID: __omp_outlined__3_wrapper, kernel ID: __omp_offloading}}
{
#pragma omp parallel // #3 \
- // expected-remark@#3 {{Found a parallel region that is called in a target region but not part of a combined target construct nor nesed inside a target construct without intermediate code. This can lead to excessive register usage for unrelated target regions in the same translation unit due to spurious call edges assumed by ptxas.}} \
+ // expected-remark@#3 {{Found a parallel region that is called in a target region but not part of a combined target construct nor nested inside a target construct without intermediate code. This can lead to excessive register usage for unrelated target regions in the same translation unit due to spurious call edges assumed by ptxas.}} \
// expected-remark@#3 {{Specialize parallel region that is only reached from a single target region to avoid spurious call edges and excessive register usage in other target regions. (parallel region ID: __omp_outlined__1_wrapper, kernel ID: __omp_offloading}}
{
}
bar();
#pragma omp parallel // #4 \
- // expected-remark@#4 {{Found a parallel region that is called in a target region but not part of a combined target construct nor nesed inside a target construct without intermediate code. This can lead to excessive register usage for unrelated target regions in the same translation unit due to spurious call edges assumed by ptxas.}} \
+ // expected-remark@#4 {{Found a parallel region that is called in a target region but not part of a combined target construct nor nested inside a target construct without intermediate code. This can lead to excessive register usage for unrelated target regions in the same translation unit due to spurious call edges assumed by ptxas.}} \
// expected-remark@#4 {{Specialize parallel region that is only reached from a single target region to avoid spurious call edges and excessive register usage in other target regions. (parallel region ID: __omp_outlined__3_wrapper, kernel ID: __omp_offloading}}
{
}
@@ -44,4 +44,4 @@
}
// expected-remark@* {{OpenMP runtime call __kmpc_global_thread_num moved to}}
-// expected-remark@* {{OpenMP runtime call __kmpc_global_thread_num deduplicated}}
+// expected-remark@* 2 {{OpenMP runtime call __kmpc_global_thread_num deduplicated}}
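The updated moved-to/deduplicated counts above reflect the optimization the remarks describe: repeated queries of the same thread id are folded into one call. A rough C illustration of that redundancy follows; loc and use are stand-ins, and the real transformation runs on LLVM IR (in the OpenMPOpt pass), not on C source.

extern int __kmpc_global_thread_num(void *loc);
extern void use(int tid);

void before(void *loc) {
  use(__kmpc_global_thread_num(loc));
  use(__kmpc_global_thread_num(loc)); /* duplicate runtime call */
}

void after(void *loc) {
  int tid = __kmpc_global_thread_num(loc); /* single, hoisted call */
  use(tid);
  use(tid);
}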
diff --git a/clang/test/OpenMP/target_parallel_debug_codegen.cpp b/clang/test/OpenMP/target_parallel_debug_codegen.cpp
--- a/clang/test/OpenMP/target_parallel_debug_codegen.cpp
+++ b/clang/test/OpenMP/target_parallel_debug_codegen.cpp
@@ -1,3 +1,4 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature --include-generated-funcs --replace-function-regex "__omp_offloading_[0-9a-z]+_[0-9a-z]+" --prefix-filecheck-ir-name _
// RUN: %clang_cc1 -DCK1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -fopenmp-cuda-mode -emit-llvm-bc %s -o %t-ppc-host.bc -fopenmp-version=45
// RUN: %clang_cc1 -DCK1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -fopenmp-cuda-mode -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -debug-info-kind=limited -fopenmp-version=45 | FileCheck %s
// expected-no-diagnostics
@@ -11,7 +12,6 @@
template<> int S<&aaa>::a;
template struct S<&aaa>;
-// CHECK-NOT: @aaa
int main() {
/* int(*b)[a]; */
@@ -64,74 +64,664 @@
}
return 0;
}
-
-// CHECK: define internal void @__omp_offloading{{[^(]+}}([10 x [10 x [10 x i32]]] addrspace(1)* {{[^,]+}}, i32 {{[^,]+}}, [10 x [10 x i32]]* {{[^,]+}}, i8 addrspace(1)* noalias{{[^,]+}})
-// CHECK: addrspacecast [10 x [10 x [10 x i32]]] addrspace(1)* %{{.+}} to [10 x [10 x [10 x i32]]]*
-// CHECK: call void [[NONDEBUG_WRAPPER:.+]](i32* {{[^,]+}}, i32* {{[^,]+}}, [10 x [10 x [10 x i32]]]* {{[^,]+}}, i64 {{[^,]+}}, [10 x [10 x i32]]* {{[^,]+}}, i8* {{[^)]+}})
-
-// CHECK: define internal void [[DEBUG_PARALLEL:@.+]](i32* {{[^,]+}}, i32* {{[^,]+}}, [10 x [10 x [10 x i32]]] addrspace(1)* noalias{{[^,]+}}, i32 {{[^,]+}}, [10 x [10 x i32]]* noalias{{[^,]+}}, i8 addrspace(1)* noalias{{[^)]+}})
-// CHECK: addrspacecast [10 x [10 x [10 x i32]]] addrspace(1)* %{{.+}} to [10 x [10 x [10 x i32]]]*
-
-// CHECK: define internal void [[NONDEBUG_WRAPPER]](i32* {{[^,]+}}, i32* {{[^,]+}}, [10 x [10 x [10 x i32]]]* nonnull align {{[0-9]+}} dereferenceable{{[^,]+}}, i64 {{[^,]+}}, [10 x [10 x i32]]* nonnull align {{[0-9]+}} dereferenceable{{[^,]+}}, i8* nonnull align {{[0-9]+}} dereferenceable{{[^)]+}})
-// CHECK: addrspacecast [10 x [10 x [10 x i32]]]* %{{.+}} to [10 x [10 x [10 x i32]]] addrspace(1)*
-// CHECK: call void [[DEBUG_PARALLEL]](i32* {{[^,]+}}, i32* {{[^,]+}}, [10 x [10 x [10 x i32]]] addrspace(1)* {{[^,]+}}, i32 {{[^,]+}}, [10 x [10 x i32]]* {{[^,]+}}, i8 addrspace(1)* {{[^)]+}})
-
-// CHECK: define weak void @__omp_offloading_{{[^(]+}}([10 x [10 x [10 x i32]]]* nonnull align {{[0-9]+}} dereferenceable{{[^,]+}}, i64 {{[^,]+}}, [10 x [10 x i32]]* nonnull align {{[0-9]+}} dereferenceable{{[^,]+}}, i8* nonnull align {{[0-9]+}} dereferenceable{{[^)]+}})
-// CHECK: addrspacecast [10 x [10 x [10 x i32]]]* %{{.+}} to [10 x [10 x [10 x i32]]] addrspace(1)*
-// CHECK: call void @__omp_offloading_{{[^(]+}}([10 x [10 x [10 x i32]]] addrspace(1)* {{[^,]+}}, i32 {{[^,]+}}, [10 x [10 x i32]]* {{[^,]+}}, i8 addrspace(1)* {{[^)]+}})
-
-// CHECK: define internal void @__omp_offloading_{{[^(]+}}([10 x [10 x [10 x i32]]] addrspace(1)* noalias{{[^,]+}}, i32 {{[^,]+}}, [10 x [10 x i32]] addrspace(1)* noalias{{[^,]+}}, i8 addrspace(1)* noalias{{[^)]+}})
-// CHECK: addrspacecast [10 x [10 x [10 x i32]]] addrspace(1)* %{{.+}} to [10 x [10 x [10 x i32]]]*
-// CHECK: addrspacecast [10 x [10 x i32]] addrspace(1)* %{{.+}} to [10 x [10 x i32]]*
-// CHECK: call void [[NONDEBUG_WRAPPER:.+]](i32* {{[^,]+}}, i32* {{[^,]+}}, [10 x [10 x [10 x i32]]]* {{[^,]+}}, i64 {{[^,]+}}, [10 x [10 x i32]]* {{[^,]+}}, i8* {{[^)]+}})
-
-// CHECK: define internal void [[DEBUG_PARALLEL:@.+]](i32* {{[^,]+}}, i32* {{[^,]+}}, [10 x [10 x [10 x i32]]] addrspace(1)* noalias{{[^,]+}}, i32 {{[^,]+}}, [10 x [10 x i32]] addrspace(1)* noalias{{[^,]+}}, i8 addrspace(1)* {{[^)]+}})
-// CHECK: addrspacecast [10 x [10 x [10 x i32]]] addrspace(1)* %{{.+}} to [10 x [10 x [10 x i32]]]*
-// CHECK: addrspacecast [10 x [10 x i32]] addrspace(1)* %{{.+}} to [10 x [10 x i32]]*
-
-// CHECK: define internal void [[NONDEBUG_WRAPPER]](i32* {{[^,]+}}, i32* {{[^,]+}}, [10 x [10 x [10 x i32]]]* nonnull align {{[0-9]+}} dereferenceable{{[^,]+}}, i64 {{[^,]+}}, [10 x [10 x i32]]* nonnull align {{[0-9]+}} dereferenceable{{[^,]+}}, i8* nonnull align {{[0-9]+}} dereferenceable{{[^)]+}})
-// CHECK: addrspacecast [10 x [10 x [10 x i32]]]* %{{.+}} to [10 x [10 x [10 x i32]]] addrspace(1)*
-// CHECK: addrspacecast [10 x [10 x i32]]* %{{.+}} to [10 x [10 x i32]] addrspace(1)*
-// CHECK: call void [[DEBUG_PARALLEL]](i32* {{[^,]+}}, i32* {{[^,]+}}, [10 x [10 x [10 x i32]]] addrspace(1)* {{[^,]+}}, i32 {{[^,]+}}, [10 x [10 x i32]] addrspace(1)* {{[^,]+}}, i8 addrspace(1)* {{[^)]+}})
-
-// CHECK: define weak void @__omp_offloading_{{[^(]+}}([10 x [10 x [10 x i32]]]* nonnull align {{[0-9]+}} dereferenceable{{[^,]+}}, i64 {{[^,]+}}, [10 x [10 x i32]]* nonnull align {{[0-9]+}} dereferenceable{{[^,]+}}, i8* nonnull align {{[0-9]+}} dereferenceable{{[^)]+}})
-// CHECK: addrspacecast [10 x [10 x [10 x i32]]]* %{{.+}} to [10 x [10 x [10 x i32]]] addrspace(1)*
-// CHECK: addrspacecast [10 x [10 x i32]]* %{{.+}} to [10 x [10 x i32]] addrspace(1)*
-// CHECK: call void @__omp_offloading_{{[^(]+}}([10 x [10 x [10 x i32]]] addrspace(1)* {{[^,]+}}, i32 {{[^,]+}}, [10 x [10 x i32]] addrspace(1)* {{[^,]+}}, i8 addrspace(1)* {{[^)]+}})
-
-// CHECK: define internal void @__omp_offloading_{{[^(]+}}([10 x [10 x [10 x i32]]] addrspace(1)* noalias{{[^,]+}}, i32 addrspace(1)* noalias{{[^,]+}}, [10 x [10 x i32]] addrspace(1)* noalias{{[^,]+}}, i8 addrspace(1)* noalias{{[^)]+}})
-// CHECK: addrspacecast [10 x [10 x [10 x i32]]] addrspace(1)* %{{.+}} to [10 x [10 x [10 x i32]]]*
-// CHECK: addrspacecast i32 addrspace(1)* %{{.+}} to i32*
-// CHECK: addrspacecast [10 x [10 x i32]] addrspace(1)* %{{.+}} to [10 x [10 x i32]]*
-// CHECK: call void @[[NONDEBUG_WRAPPER:.+]](i32* {{[^,]+}}, i32* {{[^,]+}}, [10 x [10 x [10 x i32]]]* {{[^,]+}}, i32* {{[^,]+}}, [10 x [10 x i32]]* {{[^,]+}}, i8* {{[^)]+}})
-
-// CHECK: define internal void @[[DEBUG_PARALLEL:.+]](i32* {{[^,]+}}, i32* {{[^,]+}}, [10 x [10 x [10 x i32]]] addrspace(1)* noalias{{[^,]+}}, i32 addrspace(1)* noalias{{[^,]+}}, [10 x [10 x i32]] addrspace(1)* noalias{{[^,]+}}, i8 addrspace(1)* noalias{{[^)]+}})
-// CHECK: addrspacecast [10 x [10 x [10 x i32]]] addrspace(1)* %{{.+}} to [10 x [10 x [10 x i32]]]*
-// CHECK: addrspacecast i32 addrspace(1)* %{{.+}} to i32*
-// CHECK: addrspacecast [10 x [10 x i32]] addrspace(1)* %{{.+}} to [10 x [10 x i32]]*
-
-// CHECK: define internal void @[[NONDEBUG_WRAPPER]](i32* {{[^,]+}}, i32* {{[^,]+}}, [10 x [10 x [10 x i32]]]* nonnull align {{[0-9]+}} dereferenceable{{[^,]+}}, i32* nonnull align {{[0-9]+}} dereferenceable{{[^,]+}}, [10 x [10 x i32]]* nonnull align {{[0-9]+}} dereferenceable{{[^,]+}}, i8* nonnull align {{[0-9]+}} dereferenceable{{[^)]+}})
-// CHECK: addrspacecast [10 x [10 x [10 x i32]]]* %{{.+}} to [10 x [10 x [10 x i32]]] addrspace(1)*
-// CHECK: addrspacecast i32* %{{.+}} to i32 addrspace(1)*
-// CHECK: addrspacecast [10 x [10 x i32]]* %{{.+}} to [10 x [10 x i32]] addrspace(1)*
-// CHECK: call void @[[DEBUG_PARALLEL]](i32* {{[^,]+}}, i32* {{[^,]+}}, [10 x [10 x [10 x i32]]] addrspace(1)* {{[^,]+}}, i32 addrspace(1)* {{[^,]+}}, [10 x [10 x i32]] addrspace(1)* {{[^,]+}}, i8 addrspace(1)* {{[^)]+}})
-
-// CHECK: define weak void @__omp_offloading_{{[^(]+}}([10 x [10 x [10 x i32]]]* nonnull align {{[0-9]+}} dereferenceable{{[^,]+}}, i32* nonnull align {{[0-9]+}} dereferenceable{{[^,]+}}, [10 x [10 x i32]]* nonnull align {{[0-9]+}} dereferenceable{{[^,]+}}, i8* nonnull align {{[0-9]+}} dereferenceable{{[^)]+}})
-// CHECK: addrspacecast [10 x [10 x [10 x i32]]]* %{{.+}} to [10 x [10 x [10 x i32]]] addrspace(1)*
-// CHECK: addrspacecast i32* %{{.+}} to i32 addrspace(1)*
-// CHECK: addrspacecast [10 x [10 x i32]]* %{{.+}} to [10 x [10 x i32]] addrspace(1)*
-// CHECK: call void @__omp_offloading_{{[^(]+}}([10 x [10 x [10 x i32]]] addrspace(1)* {{[^,]+}}, i32 addrspace(1)* {{[^,]+}}, [10 x [10 x i32]] addrspace(1)* {{[^,]+}}, i8 addrspace(1)* {{[^)]+}})
-
-// CHECK: !DILocalVariable(name: ".global_tid.",
-// CHECK-SAME: DIFlagArtificial
-// CHECK: !DILocalVariable(name: ".bound_tid.",
-// CHECK-SAME: DIFlagArtificial
-// CHECK: !DILocalVariable(name: "c",
-// CHECK-SAME: line: 22
-// CHECK: !DILocalVariable(name: "a",
-// CHECK-SAME: line: 20
-// CHECK: !DILocalVariable(name: "b",
-// CHECK-SAME: line: 21
-
-// CHECK-DAG: distinct !DISubprogram(name: "[[NONDEBUG_WRAPPER]]",
-// CHECK-DAG: distinct !DISubprogram(name: "[[DEBUG_PARALLEL]]",
+// CHECK-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l23_debug__
+// CHECK-SAME: ([10 x [10 x [10 x i32]]] addrspace(1)* noalias [[C:%.*]], i32 [[A:%.*]], [10 x [10 x i32]]* noalias [[B:%.*]], i8 addrspace(1)* noalias [[BB:%.*]]) #[[ATTR0:[0-9]+]] !dbg [[DBG22:![0-9]+]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x [10 x i32]]] addrspace(1)*, align 8
+// CHECK-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[B_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 8
+// CHECK-NEXT: [[BB_ADDR:%.*]] = alloca i8 addrspace(1)*, align 8
+// CHECK-NEXT: [[TMP:%.*]] = alloca [10 x [10 x [10 x i32]]]*, align 8
+// CHECK-NEXT: [[_TMP1:%.*]] = alloca [10 x [10 x i32]]*, align 8
+// CHECK-NEXT: [[_TMP2:%.*]] = alloca i8*, align 8
+// CHECK-NEXT: [[A_CASTED:%.*]] = alloca i64, align 8
+// CHECK-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x i8*], align 8
+// CHECK-NEXT: store [10 x [10 x [10 x i32]]] addrspace(1)* [[C]], [10 x [10 x [10 x i32]]] addrspace(1)** [[C_ADDR]], align 8
+// CHECK-NEXT: call void @llvm.dbg.declare(metadata [10 x [10 x [10 x i32]]] addrspace(1)** [[C_ADDR]], metadata [[META37:![0-9]+]], metadata !DIExpression()), !dbg [[DBG38:![0-9]+]]
+// CHECK-NEXT: store i32 [[A]], i32* [[A_ADDR]], align 4
+// CHECK-NEXT: call void @llvm.dbg.declare(metadata i32* [[A_ADDR]], metadata [[META39:![0-9]+]], metadata !DIExpression()), !dbg [[DBG40:![0-9]+]]
+// CHECK-NEXT: store [10 x [10 x i32]]* [[B]], [10 x [10 x i32]]** [[B_ADDR]], align 8
+// CHECK-NEXT: call void @llvm.dbg.declare(metadata [10 x [10 x i32]]** [[B_ADDR]], metadata [[META41:![0-9]+]], metadata !DIExpression()), !dbg [[DBG42:![0-9]+]]
+// CHECK-NEXT: store i8 addrspace(1)* [[BB]], i8 addrspace(1)** [[BB_ADDR]], align 8
+// CHECK-NEXT: call void @llvm.dbg.declare(metadata i8 addrspace(1)** [[BB_ADDR]], metadata [[META43:![0-9]+]], metadata !DIExpression()), !dbg [[DBG44:![0-9]+]]
+// CHECK-NEXT: [[TMP0:%.*]] = load [10 x [10 x [10 x i32]]] addrspace(1)*, [10 x [10 x [10 x i32]]] addrspace(1)** [[C_ADDR]], align 8, !dbg [[DBG45:![0-9]+]]
+// CHECK-NEXT: [[TMP1:%.*]] = addrspacecast [10 x [10 x [10 x i32]]] addrspace(1)* [[TMP0]] to [10 x [10 x [10 x i32]]]*, !dbg [[DBG45]]
+// CHECK-NEXT: store [10 x [10 x [10 x i32]]]* [[TMP1]], [10 x [10 x [10 x i32]]]** [[TMP]], align 8, !dbg [[DBG45]]
+// CHECK-NEXT: [[TMP2:%.*]] = load [10 x [10 x [10 x i32]]]*, [10 x [10 x [10 x i32]]]** [[TMP]], align 8, !dbg [[DBG45]]
+// CHECK-NEXT: [[TMP3:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[B_ADDR]], align 8, !dbg [[DBG45]]
+// CHECK-NEXT: store [10 x [10 x i32]]* [[TMP3]], [10 x [10 x i32]]** [[_TMP1]], align 8, !dbg [[DBG45]]
+// CHECK-NEXT: [[TMP4:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[_TMP1]], align 8, !dbg [[DBG45]]
+// CHECK-NEXT: [[TMP5:%.*]] = load i8 addrspace(1)*, i8 addrspace(1)** [[BB_ADDR]], align 8, !dbg [[DBG45]]
+// CHECK-NEXT: [[TMP6:%.*]] = addrspacecast i8 addrspace(1)* [[TMP5]] to i8*, !dbg [[DBG45]]
+// CHECK-NEXT: store i8* [[TMP6]], i8** [[_TMP2]], align 8, !dbg [[DBG45]]
+// CHECK-NEXT: [[TMP7:%.*]] = load i8*, i8** [[_TMP2]], align 8, !dbg [[DBG45]]
+// CHECK-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !dbg [[DBG45]]
+// CHECK-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 1), !dbg [[DBG45]]
+// CHECK-NEXT: call void @__kmpc_data_sharing_init_stack_spmd(), !dbg [[DBG45]]
+// CHECK-NEXT: br label [[DOTEXECUTE:%.*]], !dbg [[DBG45]]
+// CHECK: .execute:
+// CHECK-NEXT: [[TMP8:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]])
+// CHECK-NEXT: [[TMP9:%.*]] = load i32, i32* [[A_ADDR]], align 4, !dbg [[DBG46:![0-9]+]]
+// CHECK-NEXT: [[CONV:%.*]] = bitcast i64* [[A_CASTED]] to i32*, !dbg [[DBG46]]
+// CHECK-NEXT: store i32 [[TMP9]], i32* [[CONV]], align 4, !dbg [[DBG46]]
+// CHECK-NEXT: [[TMP10:%.*]] = load i64, i64* [[A_CASTED]], align 8, !dbg [[DBG46]]
+// CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0, !dbg [[DBG46]]
+// CHECK-NEXT: [[TMP12:%.*]] = bitcast [10 x [10 x [10 x i32]]]* [[TMP2]] to i8*, !dbg [[DBG46]]
+// CHECK-NEXT: store i8* [[TMP12]], i8** [[TMP11]], align 8, !dbg [[DBG46]]
+// CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1, !dbg [[DBG46]]
+// CHECK-NEXT: [[TMP14:%.*]] = inttoptr i64 [[TMP10]] to i8*, !dbg [[DBG46]]
+// CHECK-NEXT: store i8* [[TMP14]], i8** [[TMP13]], align 8, !dbg [[DBG46]]
+// CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 2, !dbg [[DBG46]]
+// CHECK-NEXT: [[TMP16:%.*]] = bitcast [10 x [10 x i32]]* [[TMP4]] to i8*, !dbg [[DBG46]]
+// CHECK-NEXT: store i8* [[TMP16]], i8** [[TMP15]], align 8, !dbg [[DBG46]]
+// CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 3, !dbg [[DBG46]]
+// CHECK-NEXT: store i8* [[TMP7]], i8** [[TMP17]], align 8, !dbg [[DBG46]]
+// CHECK-NEXT: [[TMP18:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**, !dbg [[DBG46]]
+// CHECK-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP8]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, [10 x [10 x [10 x i32]]]*, i64, [10 x [10 x i32]]*, i8*)* @__omp_outlined__ to i8*), i8* null, i8** [[TMP18]], i64 4), !dbg [[DBG46]]
+// CHECK-NEXT: br label [[DOTOMP_DEINIT:%.*]], !dbg [[DBG47:![0-9]+]]
+// CHECK: .omp.deinit:
+// CHECK-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 1), !dbg [[DBG47]]
+// CHECK-NEXT: br label [[DOTEXIT:%.*]], !dbg [[DBG47]]
+// CHECK: .exit:
+// CHECK-NEXT: ret void, !dbg [[DBG49:![0-9]+]]
+//
+//
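The kernel entry checked above follows a fixed SPMD sequence: read the block size, initialize the SPMD runtime, pack every captured variable into an array of opaque pointers, and hand that array to __kmpc_parallel_51. A hand-written C sketch of that shape follows; it uses simplified stand-in declarations for the device runtime (the real code is IR emitted by clang, and the parallel_51 call itself is elided here).

/* Sketch of the SPMD kernel prologue/epilogue verified by the checks. */
extern void __kmpc_spmd_kernel_init(int nthreads, short require_data_sharing);
extern void __kmpc_data_sharing_init_stack_spmd(void);
extern void __kmpc_spmd_kernel_deinit_v2(short is_spmd);

static void spmd_prologue_sketch(void *c, long a, void *b, void *bb,
                                 int nthreads) {
  __kmpc_spmd_kernel_init(nthreads, 1);
  __kmpc_data_sharing_init_stack_spmd();

  /* Captured variables travel as an array of opaque pointers; a by-value
     scalar like 'a' is smuggled through an inttoptr-style cast. */
  void *captured_vars_addrs[4];
  captured_vars_addrs[0] = c;
  captured_vars_addrs[1] = (void *)a;
  captured_vars_addrs[2] = b;
  captured_vars_addrs[3] = bb;
  (void)captured_vars_addrs; /* passed to __kmpc_parallel_51 in the IR */

  __kmpc_spmd_kernel_deinit_v2(1);
}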
+// CHECK-LABEL: define {{[^@]+}}@__omp_outlined___debug__
+// CHECK-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x [10 x [10 x i32]]] addrspace(1)* noalias [[C:%.*]], i32 [[A:%.*]], [10 x [10 x i32]]* noalias [[B:%.*]], i8 addrspace(1)* noalias [[BB:%.*]]) #[[ATTR0]] !dbg [[DBG50:![0-9]+]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x [10 x i32]]] addrspace(1)*, align 8
+// CHECK-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[B_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 8
+// CHECK-NEXT: [[BB_ADDR:%.*]] = alloca i8 addrspace(1)*, align 8
+// CHECK-NEXT: [[TMP:%.*]] = alloca [10 x [10 x [10 x i32]]]*, align 8
+// CHECK-NEXT: [[_TMP1:%.*]] = alloca [10 x [10 x i32]]*, align 8
+// CHECK-NEXT: [[_TMP2:%.*]] = alloca i8*, align 8
+// CHECK-NEXT: [[B3:%.*]] = alloca [10 x [10 x i32]], align 4
+// CHECK-NEXT: [[F:%.*]] = alloca i32*, align 8
+// CHECK-NEXT: [[G:%.*]] = alloca i32*, align 8
+// CHECK-NEXT: [[H:%.*]] = alloca i32*, align 8
+// CHECK-NEXT: [[D:%.*]] = alloca i32, align 4
+// CHECK-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-NEXT: call void @llvm.dbg.declare(metadata i32** [[DOTGLOBAL_TID__ADDR]], metadata [[META57:![0-9]+]], metadata !DIExpression()), !dbg [[DBG58:![0-9]+]]
+// CHECK-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
+// CHECK-NEXT: call void @llvm.dbg.declare(metadata i32** [[DOTBOUND_TID__ADDR]], metadata [[META59:![0-9]+]], metadata !DIExpression()), !dbg [[DBG58]]
+// CHECK-NEXT: store [10 x [10 x [10 x i32]]] addrspace(1)* [[C]], [10 x [10 x [10 x i32]]] addrspace(1)** [[C_ADDR]], align 8
+// CHECK-NEXT: call void @llvm.dbg.declare(metadata [10 x [10 x [10 x i32]]] addrspace(1)** [[C_ADDR]], metadata [[META60:![0-9]+]], metadata !DIExpression()), !dbg [[DBG61:![0-9]+]]
+// CHECK-NEXT: store i32 [[A]], i32* [[A_ADDR]], align 4
+// CHECK-NEXT: call void @llvm.dbg.declare(metadata i32* [[A_ADDR]], metadata [[META62:![0-9]+]], metadata !DIExpression()), !dbg [[DBG63:![0-9]+]]
+// CHECK-NEXT: store [10 x [10 x i32]]* [[B]], [10 x [10 x i32]]** [[B_ADDR]], align 8
+// CHECK-NEXT: call void @llvm.dbg.declare(metadata [10 x [10 x i32]]** [[B_ADDR]], metadata [[META64:![0-9]+]], metadata !DIExpression()), !dbg [[DBG65:![0-9]+]]
+// CHECK-NEXT: store i8 addrspace(1)* [[BB]], i8 addrspace(1)** [[BB_ADDR]], align 8
+// CHECK-NEXT: call void @llvm.dbg.declare(metadata i8 addrspace(1)** [[BB_ADDR]], metadata [[META66:![0-9]+]], metadata !DIExpression()), !dbg [[DBG67:![0-9]+]]
+// CHECK-NEXT: [[TMP0:%.*]] = load [10 x [10 x [10 x i32]]] addrspace(1)*, [10 x [10 x [10 x i32]]] addrspace(1)** [[C_ADDR]], align 8, !dbg [[DBG68:![0-9]+]]
+// CHECK-NEXT: [[TMP1:%.*]] = addrspacecast [10 x [10 x [10 x i32]]] addrspace(1)* [[TMP0]] to [10 x [10 x [10 x i32]]]*, !dbg [[DBG68]]
+// CHECK-NEXT: store [10 x [10 x [10 x i32]]]* [[TMP1]], [10 x [10 x [10 x i32]]]** [[TMP]], align 8, !dbg [[DBG68]]
+// CHECK-NEXT: [[TMP2:%.*]] = load [10 x [10 x [10 x i32]]]*, [10 x [10 x [10 x i32]]]** [[TMP]], align 8, !dbg [[DBG68]]
+// CHECK-NEXT: [[TMP3:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[B_ADDR]], align 8, !dbg [[DBG68]]
+// CHECK-NEXT: store [10 x [10 x i32]]* [[TMP3]], [10 x [10 x i32]]** [[_TMP1]], align 8, !dbg [[DBG68]]
+// CHECK-NEXT: [[TMP4:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[_TMP1]], align 8, !dbg [[DBG68]]
+// CHECK-NEXT: [[TMP5:%.*]] = load i8 addrspace(1)*, i8 addrspace(1)** [[BB_ADDR]], align 8, !dbg [[DBG68]]
+// CHECK-NEXT: [[TMP6:%.*]] = addrspacecast i8 addrspace(1)* [[TMP5]] to i8*, !dbg [[DBG68]]
+// CHECK-NEXT: store i8* [[TMP6]], i8** [[_TMP2]], align 8, !dbg [[DBG68]]
+// CHECK-NEXT: [[TMP7:%.*]] = load i8*, i8** [[_TMP2]], align 8, !dbg [[DBG68]]
+// CHECK-NEXT: call void @llvm.dbg.declare(metadata [10 x [10 x i32]]* [[B3]], metadata [[META69:![0-9]+]], metadata !DIExpression()), !dbg [[DBG58]]
+// CHECK-NEXT: [[TMP8:%.*]] = bitcast [10 x [10 x i32]]* [[B3]] to i8*, !dbg [[DBG68]]
+// CHECK-NEXT: [[TMP9:%.*]] = bitcast [10 x [10 x i32]]* [[TMP4]] to i8*, !dbg [[DBG68]]
+// CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[TMP8]], i8* align 4 [[TMP9]], i64 400, i1 false), !dbg [[DBG68]]
+// CHECK-NEXT: call void @llvm.dbg.declare(metadata i32** [[F]], metadata [[META70:![0-9]+]], metadata !DIExpression()), !dbg [[DBG73:![0-9]+]]
+// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x [10 x [10 x i32]]], [10 x [10 x [10 x i32]]]* [[TMP2]], i64 0, i64 1, !dbg [[DBG74:![0-9]+]]
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* [[ARRAYIDX]], i64 0, i64 1, !dbg [[DBG74]]
+// CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[ARRAYIDX4]], i64 0, i64 1, !dbg [[DBG74]]
+// CHECK-NEXT: store i32* [[ARRAYIDX5]], i32** [[F]], align 8, !dbg [[DBG73]]
+// CHECK-NEXT: call void @llvm.dbg.declare(metadata i32** [[G]], metadata [[META75:![0-9]+]], metadata !DIExpression()), !dbg [[DBG76:![0-9]+]]
+// CHECK-NEXT: store i32* [[A_ADDR]], i32** [[G]], align 8, !dbg [[DBG76]]
+// CHECK-NEXT: call void @llvm.dbg.declare(metadata i32** [[H]], metadata [[META77:![0-9]+]], metadata !DIExpression()), !dbg [[DBG78:![0-9]+]]
+// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* [[B3]], i64 0, i64 1, !dbg [[DBG79:![0-9]+]]
+// CHECK-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[ARRAYIDX6]], i64 0, i64 1, !dbg [[DBG79]]
+// CHECK-NEXT: store i32* [[ARRAYIDX7]], i32** [[H]], align 8, !dbg [[DBG78]]
+// CHECK-NEXT: call void @llvm.dbg.declare(metadata i32* [[D]], metadata [[META80:![0-9]+]], metadata !DIExpression()), !dbg [[DBG81:![0-9]+]]
+// CHECK-NEXT: store i32 15, i32* [[D]], align 4, !dbg [[DBG81]]
+// CHECK-NEXT: store i32 5, i32* [[A_ADDR]], align 4, !dbg [[DBG82:![0-9]+]]
+// CHECK-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* [[B3]], i64 0, i64 0, !dbg [[DBG83:![0-9]+]]
+// CHECK-NEXT: [[TMP10:%.*]] = load i32, i32* [[A_ADDR]], align 4, !dbg [[DBG84:![0-9]+]]
+// CHECK-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP10]] to i64, !dbg [[DBG83]]
+// CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[ARRAYIDX8]], i64 0, i64 [[IDXPROM]], !dbg [[DBG83]]
+// CHECK-NEXT: store i32 10, i32* [[ARRAYIDX9]], align 4, !dbg [[DBG85:![0-9]+]]
+// CHECK-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds [10 x [10 x [10 x i32]]], [10 x [10 x [10 x i32]]]* [[TMP2]], i64 0, i64 0, !dbg [[DBG86:![0-9]+]]
+// CHECK-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* [[ARRAYIDX10]], i64 0, i64 0, !dbg [[DBG86]]
+// CHECK-NEXT: [[TMP11:%.*]] = load i32, i32* [[A_ADDR]], align 4, !dbg [[DBG87:![0-9]+]]
+// CHECK-NEXT: [[IDXPROM12:%.*]] = sext i32 [[TMP11]] to i64, !dbg [[DBG86]]
+// CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[ARRAYIDX11]], i64 0, i64 [[IDXPROM12]], !dbg [[DBG86]]
+// CHECK-NEXT: store i32 11, i32* [[ARRAYIDX13]], align 4, !dbg [[DBG88:![0-9]+]]
+// CHECK-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds [10 x [10 x [10 x i32]]], [10 x [10 x [10 x i32]]]* [[TMP2]], i64 0, i64 0, !dbg [[DBG89:![0-9]+]]
+// CHECK-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* [[ARRAYIDX14]], i64 0, i64 0, !dbg [[DBG89]]
+// CHECK-NEXT: [[TMP12:%.*]] = load i32, i32* [[A_ADDR]], align 4, !dbg [[DBG90:![0-9]+]]
+// CHECK-NEXT: [[IDXPROM16:%.*]] = sext i32 [[TMP12]] to i64, !dbg [[DBG89]]
+// CHECK-NEXT: [[ARRAYIDX17:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[ARRAYIDX15]], i64 0, i64 [[IDXPROM16]], !dbg [[DBG89]]
+// CHECK-NEXT: [[TMP13:%.*]] = load i32, i32* [[ARRAYIDX17]], align 4, !dbg [[DBG89]]
+// CHECK-NEXT: [[ARRAYIDX18:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* [[B3]], i64 0, i64 0, !dbg [[DBG91:![0-9]+]]
+// CHECK-NEXT: [[TMP14:%.*]] = load i32, i32* [[A_ADDR]], align 4, !dbg [[DBG92:![0-9]+]]
+// CHECK-NEXT: [[IDXPROM19:%.*]] = sext i32 [[TMP14]] to i64, !dbg [[DBG91]]
+// CHECK-NEXT: [[ARRAYIDX20:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[ARRAYIDX18]], i64 0, i64 [[IDXPROM19]], !dbg [[DBG91]]
+// CHECK-NEXT: store i32 [[TMP13]], i32* [[ARRAYIDX20]], align 4, !dbg [[DBG93:![0-9]+]]
+// CHECK-NEXT: [[ARRAYIDX21:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* [[B3]], i64 0, i64 0, !dbg [[DBG94:![0-9]+]]
+// CHECK-NEXT: [[TMP15:%.*]] = load i32, i32* [[A_ADDR]], align 4, !dbg [[DBG95:![0-9]+]]
+// CHECK-NEXT: [[IDXPROM22:%.*]] = sext i32 [[TMP15]] to i64, !dbg [[DBG94]]
+// CHECK-NEXT: [[ARRAYIDX23:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[ARRAYIDX21]], i64 0, i64 [[IDXPROM22]], !dbg [[DBG94]]
+// CHECK-NEXT: [[TMP16:%.*]] = load i32, i32* [[ARRAYIDX23]], align 4, !dbg [[DBG94]]
+// CHECK-NEXT: [[TMP17:%.*]] = load i8, i8* [[TMP7]], align 1, !dbg [[DBG96:![0-9]+]]
+// CHECK-NEXT: [[TOBOOL:%.*]] = trunc i8 [[TMP17]] to i1, !dbg [[DBG96]]
+// CHECK-NEXT: [[CONV:%.*]] = zext i1 [[TOBOOL]] to i32, !dbg [[DBG96]]
+// CHECK-NEXT: [[OR:%.*]] = or i32 [[CONV]], [[TMP16]], !dbg [[DBG96]]
+// CHECK-NEXT: [[TOBOOL24:%.*]] = icmp ne i32 [[OR]], 0, !dbg [[DBG96]]
+// CHECK-NEXT: [[FROMBOOL:%.*]] = zext i1 [[TOBOOL24]] to i8, !dbg [[DBG96]]
+// CHECK-NEXT: store i8 [[FROMBOOL]], i8* [[TMP7]], align 1, !dbg [[DBG96]]
+// CHECK-NEXT: ret void, !dbg [[DBG97:![0-9]+]]
+//
+//
+// CHECK-LABEL: define {{[^@]+}}@__omp_outlined__
+// CHECK-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x [10 x [10 x i32]]]* nonnull align 4 dereferenceable(4000) [[C:%.*]], i64 [[A:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[B:%.*]], i8* nonnull align 1 dereferenceable(1) [[BB:%.*]]) #[[ATTR0]] !dbg [[DBG98:![0-9]+]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x [10 x i32]]]*, align 8
+// CHECK-NEXT: [[A_ADDR:%.*]] = alloca i64, align 8
+// CHECK-NEXT: [[B_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 8
+// CHECK-NEXT: [[BB_ADDR:%.*]] = alloca i8*, align 8
+// CHECK-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-NEXT: call void @llvm.dbg.declare(metadata i32** [[DOTGLOBAL_TID__ADDR]], metadata [[META105:![0-9]+]], metadata !DIExpression()), !dbg [[DBG106:![0-9]+]]
+// CHECK-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
+// CHECK-NEXT: call void @llvm.dbg.declare(metadata i32** [[DOTBOUND_TID__ADDR]], metadata [[META107:![0-9]+]], metadata !DIExpression()), !dbg [[DBG106]]
+// CHECK-NEXT: store [10 x [10 x [10 x i32]]]* [[C]], [10 x [10 x [10 x i32]]]** [[C_ADDR]], align 8
+// CHECK-NEXT: call void @llvm.dbg.declare(metadata [10 x [10 x [10 x i32]]]** [[C_ADDR]], metadata [[META108:![0-9]+]], metadata !DIExpression()), !dbg [[DBG106]]
+// CHECK-NEXT: store i64 [[A]], i64* [[A_ADDR]], align 8
+// CHECK-NEXT: call void @llvm.dbg.declare(metadata i64* [[A_ADDR]], metadata [[META109:![0-9]+]], metadata !DIExpression()), !dbg [[DBG106]]
+// CHECK-NEXT: store [10 x [10 x i32]]* [[B]], [10 x [10 x i32]]** [[B_ADDR]], align 8
+// CHECK-NEXT: call void @llvm.dbg.declare(metadata [10 x [10 x i32]]** [[B_ADDR]], metadata [[META110:![0-9]+]], metadata !DIExpression()), !dbg [[DBG106]]
+// CHECK-NEXT: store i8* [[BB]], i8** [[BB_ADDR]], align 8
+// CHECK-NEXT: call void @llvm.dbg.declare(metadata i8** [[BB_ADDR]], metadata [[META111:![0-9]+]], metadata !DIExpression()), !dbg [[DBG106]]
+// CHECK-NEXT: [[TMP0:%.*]] = load [10 x [10 x [10 x i32]]]*, [10 x [10 x [10 x i32]]]** [[C_ADDR]], align 8, !dbg [[DBG112:![0-9]+]]
+// CHECK-NEXT: [[CONV:%.*]] = bitcast i64* [[A_ADDR]] to i32*, !dbg [[DBG112]]
+// CHECK-NEXT: [[TMP1:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[B_ADDR]], align 8, !dbg [[DBG112]]
+// CHECK-NEXT: [[TMP2:%.*]] = load i8*, i8** [[BB_ADDR]], align 8, !dbg [[DBG112]]
+// CHECK-NEXT: [[TMP3:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8, !dbg [[DBG112]]
+// CHECK-NEXT: [[TMP4:%.*]] = load i32*, i32** [[DOTBOUND_TID__ADDR]], align 8, !dbg [[DBG112]]
+// CHECK-NEXT: [[TMP5:%.*]] = load [10 x [10 x [10 x i32]]]*, [10 x [10 x [10 x i32]]]** [[C_ADDR]], align 8, !dbg [[DBG112]]
+// CHECK-NEXT: [[TMP6:%.*]] = load i32, i32* [[CONV]], align 8, !dbg [[DBG112]]
+// CHECK-NEXT: [[TMP7:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[B_ADDR]], align 8, !dbg [[DBG112]]
+// CHECK-NEXT: [[TMP8:%.*]] = load i8*, i8** [[BB_ADDR]], align 8, !dbg [[DBG112]]
+// CHECK-NEXT: [[TMP9:%.*]] = addrspacecast [10 x [10 x [10 x i32]]]* [[TMP5]] to [10 x [10 x [10 x i32]]] addrspace(1)*, !dbg [[DBG112]]
+// CHECK-NEXT: [[TMP10:%.*]] = addrspacecast i8* [[TMP8]] to i8 addrspace(1)*, !dbg [[DBG112]]
+// CHECK-NEXT: call void @__omp_outlined___debug__(i32* [[TMP3]], i32* [[TMP4]], [10 x [10 x [10 x i32]]] addrspace(1)* [[TMP9]], i32 [[TMP6]], [10 x [10 x i32]]* [[TMP7]], i8 addrspace(1)* [[TMP10]]) #[[ATTR4:[0-9]+]], !dbg [[DBG112]]
+// CHECK-NEXT: ret void, !dbg [[DBG112]]
+//
+//
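The pair of functions above illustrates the debug codegen strategy this test pins down: the user code is emitted once in a *_debug__ body whose aggregate parameters carry addrspace(1), and the plain __omp_outlined__ entry point merely addrspacecasts its generic pointers and forwards. A plain-C sketch of that forwarding shape follows (C has no addrspacecast, so the cast exists only in the comments; the names are illustrative):

/* Sketch of the wrapper pattern checked above. */
static void outlined_debug(int *gtid, int *btid,
                           int *c /* addrspace(1) in the IR */) {
  (void)gtid; (void)btid; (void)c; /* the real user code lives here */
}

static void outlined(int *gtid, int *btid, int *c) {
  /* In the IR: addrspacecast the aggregate pointers to addrspace(1),
     then call the *_debug__ body, exactly as the CHECK lines verify. */
  outlined_debug(gtid, btid, c);
}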
+// CHECK-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l23
+// CHECK-SAME: ([10 x [10 x [10 x i32]]]* nonnull align 4 dereferenceable(4000) [[C:%.*]], i64 [[A:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[B:%.*]], i8* nonnull align 1 dereferenceable(1) [[BB:%.*]]) #[[ATTR0]] !dbg [[DBG113:![0-9]+]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x [10 x i32]]]*, align 8
+// CHECK-NEXT: [[A_ADDR:%.*]] = alloca i64, align 8
+// CHECK-NEXT: [[B_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 8
+// CHECK-NEXT: [[BB_ADDR:%.*]] = alloca i8*, align 8
+// CHECK-NEXT: store [10 x [10 x [10 x i32]]]* [[C]], [10 x [10 x [10 x i32]]]** [[C_ADDR]], align 8
+// CHECK-NEXT: call void @llvm.dbg.declare(metadata [10 x [10 x [10 x i32]]]** [[C_ADDR]], metadata [[META116:![0-9]+]], metadata !DIExpression()), !dbg [[DBG117:![0-9]+]]
+// CHECK-NEXT: store i64 [[A]], i64* [[A_ADDR]], align 8
+// CHECK-NEXT: call void @llvm.dbg.declare(metadata i64* [[A_ADDR]], metadata [[META118:![0-9]+]], metadata !DIExpression()), !dbg [[DBG117]]
+// CHECK-NEXT: store [10 x [10 x i32]]* [[B]], [10 x [10 x i32]]** [[B_ADDR]], align 8
+// CHECK-NEXT: call void @llvm.dbg.declare(metadata [10 x [10 x i32]]** [[B_ADDR]], metadata [[META119:![0-9]+]], metadata !DIExpression()), !dbg [[DBG117]]
+// CHECK-NEXT: store i8* [[BB]], i8** [[BB_ADDR]], align 8
+// CHECK-NEXT: call void @llvm.dbg.declare(metadata i8** [[BB_ADDR]], metadata [[META120:![0-9]+]], metadata !DIExpression()), !dbg [[DBG117]]
+// CHECK-NEXT: [[TMP0:%.*]] = load [10 x [10 x [10 x i32]]]*, [10 x [10 x [10 x i32]]]** [[C_ADDR]], align 8, !dbg [[DBG121:![0-9]+]]
+// CHECK-NEXT: [[CONV:%.*]] = bitcast i64* [[A_ADDR]] to i32*, !dbg [[DBG121]]
+// CHECK-NEXT: [[TMP1:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[B_ADDR]], align 8, !dbg [[DBG121]]
+// CHECK-NEXT: [[TMP2:%.*]] = load i8*, i8** [[BB_ADDR]], align 8, !dbg [[DBG121]]
+// CHECK-NEXT: [[TMP3:%.*]] = load [10 x [10 x [10 x i32]]]*, [10 x [10 x [10 x i32]]]** [[C_ADDR]], align 8, !dbg [[DBG121]]
+// CHECK-NEXT: [[TMP4:%.*]] = load i32, i32* [[CONV]], align 8, !dbg [[DBG121]]
+// CHECK-NEXT: [[TMP5:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[B_ADDR]], align 8, !dbg [[DBG121]]
+// CHECK-NEXT: [[TMP6:%.*]] = load i8*, i8** [[BB_ADDR]], align 8, !dbg [[DBG121]]
+// CHECK-NEXT: [[TMP7:%.*]] = addrspacecast [10 x [10 x [10 x i32]]]* [[TMP3]] to [10 x [10 x [10 x i32]]] addrspace(1)*, !dbg [[DBG121]]
+// CHECK-NEXT: [[TMP8:%.*]] = addrspacecast i8* [[TMP6]] to i8 addrspace(1)*, !dbg [[DBG121]]
+// CHECK-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l23_debug__([10 x [10 x [10 x i32]]] addrspace(1)* [[TMP7]], i32 [[TMP4]], [10 x [10 x i32]]* [[TMP5]], i8 addrspace(1)* [[TMP8]]) #[[ATTR4]], !dbg [[DBG121]]
+// CHECK-NEXT: ret void, !dbg [[DBG121]]
+//
+//
+// CHECK-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l37_debug__
+// CHECK-SAME: ([10 x [10 x [10 x i32]]] addrspace(1)* noalias [[C:%.*]], i32 [[A:%.*]], [10 x [10 x i32]] addrspace(1)* noalias [[B:%.*]], i8 addrspace(1)* noalias [[BB:%.*]]) #[[ATTR0]] !dbg [[DBG122:![0-9]+]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x [10 x i32]]] addrspace(1)*, align 8
+// CHECK-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[B_ADDR:%.*]] = alloca [10 x [10 x i32]] addrspace(1)*, align 8
+// CHECK-NEXT: [[BB_ADDR:%.*]] = alloca i8 addrspace(1)*, align 8
+// CHECK-NEXT: [[TMP:%.*]] = alloca [10 x [10 x [10 x i32]]]*, align 8
+// CHECK-NEXT: [[_TMP1:%.*]] = alloca [10 x [10 x i32]]*, align 8
+// CHECK-NEXT: [[_TMP2:%.*]] = alloca i8*, align 8
+// CHECK-NEXT: [[A_CASTED:%.*]] = alloca i64, align 8
+// CHECK-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x i8*], align 8
+// CHECK-NEXT: store [10 x [10 x [10 x i32]]] addrspace(1)* [[C]], [10 x [10 x [10 x i32]]] addrspace(1)** [[C_ADDR]], align 8
+// CHECK-NEXT: call void @llvm.dbg.declare(metadata [10 x [10 x [10 x i32]]] addrspace(1)** [[C_ADDR]], metadata [[META127:![0-9]+]], metadata !DIExpression()), !dbg [[DBG128:![0-9]+]]
+// CHECK-NEXT: store i32 [[A]], i32* [[A_ADDR]], align 4
+// CHECK-NEXT: call void @llvm.dbg.declare(metadata i32* [[A_ADDR]], metadata [[META129:![0-9]+]], metadata !DIExpression()), !dbg [[DBG130:![0-9]+]]
+// CHECK-NEXT: store [10 x [10 x i32]] addrspace(1)* [[B]], [10 x [10 x i32]] addrspace(1)** [[B_ADDR]], align 8
+// CHECK-NEXT: call void @llvm.dbg.declare(metadata [10 x [10 x i32]] addrspace(1)** [[B_ADDR]], metadata [[META131:![0-9]+]], metadata !DIExpression()), !dbg [[DBG132:![0-9]+]]
+// CHECK-NEXT: store i8 addrspace(1)* [[BB]], i8 addrspace(1)** [[BB_ADDR]], align 8
+// CHECK-NEXT: call void @llvm.dbg.declare(metadata i8 addrspace(1)** [[BB_ADDR]], metadata [[META133:![0-9]+]], metadata !DIExpression()), !dbg [[DBG134:![0-9]+]]
+// CHECK-NEXT: [[TMP0:%.*]] = load [10 x [10 x [10 x i32]]] addrspace(1)*, [10 x [10 x [10 x i32]]] addrspace(1)** [[C_ADDR]], align 8, !dbg [[DBG135:![0-9]+]]
+// CHECK-NEXT: [[TMP1:%.*]] = addrspacecast [10 x [10 x [10 x i32]]] addrspace(1)* [[TMP0]] to [10 x [10 x [10 x i32]]]*, !dbg [[DBG135]]
+// CHECK-NEXT: store [10 x [10 x [10 x i32]]]* [[TMP1]], [10 x [10 x [10 x i32]]]** [[TMP]], align 8, !dbg [[DBG135]]
+// CHECK-NEXT: [[TMP2:%.*]] = load [10 x [10 x [10 x i32]]]*, [10 x [10 x [10 x i32]]]** [[TMP]], align 8, !dbg [[DBG135]]
+// CHECK-NEXT: [[TMP3:%.*]] = load [10 x [10 x i32]] addrspace(1)*, [10 x [10 x i32]] addrspace(1)** [[B_ADDR]], align 8, !dbg [[DBG135]]
+// CHECK-NEXT: [[TMP4:%.*]] = addrspacecast [10 x [10 x i32]] addrspace(1)* [[TMP3]] to [10 x [10 x i32]]*, !dbg [[DBG135]]
+// CHECK-NEXT: store [10 x [10 x i32]]* [[TMP4]], [10 x [10 x i32]]** [[_TMP1]], align 8, !dbg [[DBG135]]
+// CHECK-NEXT: [[TMP5:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[_TMP1]], align 8, !dbg [[DBG135]]
+// CHECK-NEXT: [[TMP6:%.*]] = load i8 addrspace(1)*, i8 addrspace(1)** [[BB_ADDR]], align 8, !dbg [[DBG135]]
+// CHECK-NEXT: [[TMP7:%.*]] = addrspacecast i8 addrspace(1)* [[TMP6]] to i8*, !dbg [[DBG135]]
+// CHECK-NEXT: store i8* [[TMP7]], i8** [[_TMP2]], align 8, !dbg [[DBG135]]
+// CHECK-NEXT: [[TMP8:%.*]] = load i8*, i8** [[_TMP2]], align 8, !dbg [[DBG135]]
+// CHECK-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !dbg [[DBG135]]
+// CHECK-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 1), !dbg [[DBG135]]
+// CHECK-NEXT: call void @__kmpc_data_sharing_init_stack_spmd(), !dbg [[DBG135]]
+// CHECK-NEXT: br label [[DOTEXECUTE:%.*]], !dbg [[DBG135]]
+// CHECK: .execute:
+// CHECK-NEXT: [[TMP9:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3:[0-9]+]])
+// CHECK-NEXT: [[TMP10:%.*]] = load i32, i32* [[A_ADDR]], align 4, !dbg [[DBG136:![0-9]+]]
+// CHECK-NEXT: [[CONV:%.*]] = bitcast i64* [[A_CASTED]] to i32*, !dbg [[DBG136]]
+// CHECK-NEXT: store i32 [[TMP10]], i32* [[CONV]], align 4, !dbg [[DBG136]]
+// CHECK-NEXT: [[TMP11:%.*]] = load i64, i64* [[A_CASTED]], align 8, !dbg [[DBG136]]
+// CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0, !dbg [[DBG136]]
+// CHECK-NEXT: [[TMP13:%.*]] = bitcast [10 x [10 x [10 x i32]]]* [[TMP2]] to i8*, !dbg [[DBG136]]
+// CHECK-NEXT: store i8* [[TMP13]], i8** [[TMP12]], align 8, !dbg [[DBG136]]
+// CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1, !dbg [[DBG136]]
+// CHECK-NEXT: [[TMP15:%.*]] = inttoptr i64 [[TMP11]] to i8*, !dbg [[DBG136]]
+// CHECK-NEXT: store i8* [[TMP15]], i8** [[TMP14]], align 8, !dbg [[DBG136]]
+// CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 2, !dbg [[DBG136]]
+// CHECK-NEXT: [[TMP17:%.*]] = bitcast [10 x [10 x i32]]* [[TMP5]] to i8*, !dbg [[DBG136]]
+// CHECK-NEXT: store i8* [[TMP17]], i8** [[TMP16]], align 8, !dbg [[DBG136]]
+// CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 3, !dbg [[DBG136]]
+// CHECK-NEXT: store i8* [[TMP8]], i8** [[TMP18]], align 8, !dbg [[DBG136]]
+// CHECK-NEXT: [[TMP19:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**, !dbg [[DBG136]]
+// CHECK-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP9]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, [10 x [10 x [10 x i32]]]*, i64, [10 x [10 x i32]]*, i8*)* @__omp_outlined__2 to i8*), i8* null, i8** [[TMP19]], i64 4), !dbg [[DBG136]]
+// CHECK-NEXT: br label [[DOTOMP_DEINIT:%.*]], !dbg [[DBG137:![0-9]+]]
+// CHECK: .omp.deinit:
+// CHECK-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 1), !dbg [[DBG137]]
+// CHECK-NEXT: br label [[DOTEXIT:%.*]], !dbg [[DBG137]]
+// CHECK: .exit:
+// CHECK-NEXT: ret void, !dbg [[DBG139:![0-9]+]]
+//
+//
+// CHECK-LABEL: define {{[^@]+}}@__omp_outlined___debug__1
+// CHECK-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x [10 x [10 x i32]]] addrspace(1)* noalias [[C:%.*]], i32 [[A:%.*]], [10 x [10 x i32]] addrspace(1)* noalias [[B:%.*]], i8 addrspace(1)* noalias [[BB:%.*]]) #[[ATTR0]] !dbg [[DBG140:![0-9]+]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x [10 x i32]]] addrspace(1)*, align 8
+// CHECK-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[B_ADDR:%.*]] = alloca [10 x [10 x i32]] addrspace(1)*, align 8
+// CHECK-NEXT: [[BB_ADDR:%.*]] = alloca i8 addrspace(1)*, align 8
+// CHECK-NEXT: [[TMP:%.*]] = alloca [10 x [10 x [10 x i32]]]*, align 8
+// CHECK-NEXT: [[_TMP1:%.*]] = alloca [10 x [10 x i32]]*, align 8
+// CHECK-NEXT: [[_TMP2:%.*]] = alloca i8*, align 8
+// CHECK-NEXT: [[F:%.*]] = alloca i32*, align 8
+// CHECK-NEXT: [[G:%.*]] = alloca i32*, align 8
+// CHECK-NEXT: [[H:%.*]] = alloca i32*, align 8
+// CHECK-NEXT: [[D:%.*]] = alloca i32, align 4
+// CHECK-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-NEXT: call void @llvm.dbg.declare(metadata i32** [[DOTGLOBAL_TID__ADDR]], metadata [[META143:![0-9]+]], metadata !DIExpression()), !dbg [[DBG144:![0-9]+]]
+// CHECK-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
+// CHECK-NEXT: call void @llvm.dbg.declare(metadata i32** [[DOTBOUND_TID__ADDR]], metadata [[META145:![0-9]+]], metadata !DIExpression()), !dbg [[DBG144]]
+// CHECK-NEXT: store [10 x [10 x [10 x i32]]] addrspace(1)* [[C]], [10 x [10 x [10 x i32]]] addrspace(1)** [[C_ADDR]], align 8
+// CHECK-NEXT: call void @llvm.dbg.declare(metadata [10 x [10 x [10 x i32]]] addrspace(1)** [[C_ADDR]], metadata [[META146:![0-9]+]], metadata !DIExpression()), !dbg [[DBG147:![0-9]+]]
+// CHECK-NEXT: store i32 [[A]], i32* [[A_ADDR]], align 4
+// CHECK-NEXT: call void @llvm.dbg.declare(metadata i32* [[A_ADDR]], metadata [[META148:![0-9]+]], metadata !DIExpression()), !dbg [[DBG149:![0-9]+]]
+// CHECK-NEXT: store [10 x [10 x i32]] addrspace(1)* [[B]], [10 x [10 x i32]] addrspace(1)** [[B_ADDR]], align 8
+// CHECK-NEXT: call void @llvm.dbg.declare(metadata [10 x [10 x i32]] addrspace(1)** [[B_ADDR]], metadata [[META150:![0-9]+]], metadata !DIExpression()), !dbg [[DBG151:![0-9]+]]
+// CHECK-NEXT: store i8 addrspace(1)* [[BB]], i8 addrspace(1)** [[BB_ADDR]], align 8
+// CHECK-NEXT: call void @llvm.dbg.declare(metadata i8 addrspace(1)** [[BB_ADDR]], metadata [[META152:![0-9]+]], metadata !DIExpression()), !dbg [[DBG153:![0-9]+]]
+// CHECK-NEXT: [[TMP0:%.*]] = load [10 x [10 x [10 x i32]]] addrspace(1)*, [10 x [10 x [10 x i32]]] addrspace(1)** [[C_ADDR]], align 8, !dbg [[DBG154:![0-9]+]]
+// CHECK-NEXT: [[TMP1:%.*]] = addrspacecast [10 x [10 x [10 x i32]]] addrspace(1)* [[TMP0]] to [10 x [10 x [10 x i32]]]*, !dbg [[DBG154]]
+// CHECK-NEXT: store [10 x [10 x [10 x i32]]]* [[TMP1]], [10 x [10 x [10 x i32]]]** [[TMP]], align 8, !dbg [[DBG154]]
+// CHECK-NEXT: [[TMP2:%.*]] = load [10 x [10 x [10 x i32]]]*, [10 x [10 x [10 x i32]]]** [[TMP]], align 8, !dbg [[DBG154]]
+// CHECK-NEXT: [[TMP3:%.*]] = load [10 x [10 x i32]] addrspace(1)*, [10 x [10 x i32]] addrspace(1)** [[B_ADDR]], align 8, !dbg [[DBG154]]
+// CHECK-NEXT: [[TMP4:%.*]] = addrspacecast [10 x [10 x i32]] addrspace(1)* [[TMP3]] to [10 x [10 x i32]]*, !dbg [[DBG154]]
+// CHECK-NEXT: store [10 x [10 x i32]]* [[TMP4]], [10 x [10 x i32]]** [[_TMP1]], align 8, !dbg [[DBG154]]
+// CHECK-NEXT: [[TMP5:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[_TMP1]], align 8, !dbg [[DBG154]]
+// CHECK-NEXT: [[TMP6:%.*]] = load i8 addrspace(1)*, i8 addrspace(1)** [[BB_ADDR]], align 8, !dbg [[DBG154]]
+// CHECK-NEXT: [[TMP7:%.*]] = addrspacecast i8 addrspace(1)* [[TMP6]] to i8*, !dbg [[DBG154]]
+// CHECK-NEXT: store i8* [[TMP7]], i8** [[_TMP2]], align 8, !dbg [[DBG154]]
+// CHECK-NEXT: [[TMP8:%.*]] = load i8*, i8** [[_TMP2]], align 8, !dbg [[DBG154]]
+// CHECK-NEXT: call void @llvm.dbg.declare(metadata i32** [[F]], metadata [[META155:![0-9]+]], metadata !DIExpression()), !dbg [[DBG157:![0-9]+]]
+// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x [10 x [10 x i32]]], [10 x [10 x [10 x i32]]]* [[TMP2]], i64 0, i64 1, !dbg [[DBG158:![0-9]+]]
+// CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* [[ARRAYIDX]], i64 0, i64 1, !dbg [[DBG158]]
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[ARRAYIDX3]], i64 0, i64 1, !dbg [[DBG158]]
+// CHECK-NEXT: store i32* [[ARRAYIDX4]], i32** [[F]], align 8, !dbg [[DBG157]]
+// CHECK-NEXT: call void @llvm.dbg.declare(metadata i32** [[G]], metadata [[META159:![0-9]+]], metadata !DIExpression()), !dbg [[DBG160:![0-9]+]]
+// CHECK-NEXT: store i32* [[A_ADDR]], i32** [[G]], align 8, !dbg [[DBG160]]
+// CHECK-NEXT: call void @llvm.dbg.declare(metadata i32** [[H]], metadata [[META161:![0-9]+]], metadata !DIExpression()), !dbg [[DBG162:![0-9]+]]
+// CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* [[TMP5]], i64 0, i64 1, !dbg [[DBG163:![0-9]+]]
+// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[ARRAYIDX5]], i64 0, i64 1, !dbg [[DBG163]]
+// CHECK-NEXT: store i32* [[ARRAYIDX6]], i32** [[H]], align 8, !dbg [[DBG162]]
+// CHECK-NEXT: call void @llvm.dbg.declare(metadata i32* [[D]], metadata [[META164:![0-9]+]], metadata !DIExpression()), !dbg [[DBG165:![0-9]+]]
+// CHECK-NEXT: store i32 15, i32* [[D]], align 4, !dbg [[DBG165]]
+// CHECK-NEXT: store i32 5, i32* [[A_ADDR]], align 4, !dbg [[DBG166:![0-9]+]]
+// CHECK-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* [[TMP5]], i64 0, i64 0, !dbg [[DBG167:![0-9]+]]
+// CHECK-NEXT: [[TMP9:%.*]] = load i32, i32* [[A_ADDR]], align 4, !dbg [[DBG168:![0-9]+]]
+// CHECK-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP9]] to i64, !dbg [[DBG167]]
+// CHECK-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[ARRAYIDX7]], i64 0, i64 [[IDXPROM]], !dbg [[DBG167]]
+// CHECK-NEXT: store i32 10, i32* [[ARRAYIDX8]], align 4, !dbg [[DBG169:![0-9]+]]
+// CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds [10 x [10 x [10 x i32]]], [10 x [10 x [10 x i32]]]* [[TMP2]], i64 0, i64 0, !dbg [[DBG170:![0-9]+]]
+// CHECK-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* [[ARRAYIDX9]], i64 0, i64 0, !dbg [[DBG170]]
+// CHECK-NEXT: [[TMP10:%.*]] = load i32, i32* [[A_ADDR]], align 4, !dbg [[DBG171:![0-9]+]]
+// CHECK-NEXT: [[IDXPROM11:%.*]] = sext i32 [[TMP10]] to i64, !dbg [[DBG170]]
+// CHECK-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[ARRAYIDX10]], i64 0, i64 [[IDXPROM11]], !dbg [[DBG170]]
+// CHECK-NEXT: store i32 11, i32* [[ARRAYIDX12]], align 4, !dbg [[DBG172:![0-9]+]]
+// CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds [10 x [10 x [10 x i32]]], [10 x [10 x [10 x i32]]]* [[TMP2]], i64 0, i64 0, !dbg [[DBG173:![0-9]+]]
+// CHECK-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* [[ARRAYIDX13]], i64 0, i64 0, !dbg [[DBG173]]
+// CHECK-NEXT: [[TMP11:%.*]] = load i32, i32* [[A_ADDR]], align 4, !dbg [[DBG174:![0-9]+]]
+// CHECK-NEXT: [[IDXPROM15:%.*]] = sext i32 [[TMP11]] to i64, !dbg [[DBG173]]
+// CHECK-NEXT: [[ARRAYIDX16:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[ARRAYIDX14]], i64 0, i64 [[IDXPROM15]], !dbg [[DBG173]]
+// CHECK-NEXT: [[TMP12:%.*]] = load i32, i32* [[ARRAYIDX16]], align 4, !dbg [[DBG173]]
+// CHECK-NEXT: [[ARRAYIDX17:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* [[TMP5]], i64 0, i64 0, !dbg [[DBG175:![0-9]+]]
+// CHECK-NEXT: [[TMP13:%.*]] = load i32, i32* [[A_ADDR]], align 4, !dbg [[DBG176:![0-9]+]]
+// CHECK-NEXT: [[IDXPROM18:%.*]] = sext i32 [[TMP13]] to i64, !dbg [[DBG175]]
+// CHECK-NEXT: [[ARRAYIDX19:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[ARRAYIDX17]], i64 0, i64 [[IDXPROM18]], !dbg [[DBG175]]
+// CHECK-NEXT: store i32 [[TMP12]], i32* [[ARRAYIDX19]], align 4, !dbg [[DBG177:![0-9]+]]
+// CHECK-NEXT: [[TMP14:%.*]] = load i8, i8* [[TMP8]], align 1, !dbg [[DBG178:![0-9]+]]
+// CHECK-NEXT: [[TOBOOL:%.*]] = trunc i8 [[TMP14]] to i1, !dbg [[DBG178]]
+// CHECK-NEXT: [[CONV:%.*]] = zext i1 [[TOBOOL]] to i32, !dbg [[DBG178]]
+// CHECK-NEXT: store i32 [[CONV]], i32* [[D]], align 4, !dbg [[DBG179:![0-9]+]]
+// CHECK-NEXT: ret void, !dbg [[DBG180:![0-9]+]]
+//
+//
+// CHECK-LABEL: define {{[^@]+}}@__omp_outlined__2
+// CHECK-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x [10 x [10 x i32]]]* nonnull align 4 dereferenceable(4000) [[C:%.*]], i64 [[A:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[B:%.*]], i8* nonnull align 1 dereferenceable(1) [[BB:%.*]]) #[[ATTR0]] !dbg [[DBG181:![0-9]+]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x [10 x i32]]]*, align 8
+// CHECK-NEXT: [[A_ADDR:%.*]] = alloca i64, align 8
+// CHECK-NEXT: [[B_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 8
+// CHECK-NEXT: [[BB_ADDR:%.*]] = alloca i8*, align 8
+// CHECK-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-NEXT: call void @llvm.dbg.declare(metadata i32** [[DOTGLOBAL_TID__ADDR]], metadata [[META182:![0-9]+]], metadata !DIExpression()), !dbg [[DBG183:![0-9]+]]
+// CHECK-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
+// CHECK-NEXT: call void @llvm.dbg.declare(metadata i32** [[DOTBOUND_TID__ADDR]], metadata [[META184:![0-9]+]], metadata !DIExpression()), !dbg [[DBG183]]
+// CHECK-NEXT: store [10 x [10 x [10 x i32]]]* [[C]], [10 x [10 x [10 x i32]]]** [[C_ADDR]], align 8
+// CHECK-NEXT: call void @llvm.dbg.declare(metadata [10 x [10 x [10 x i32]]]** [[C_ADDR]], metadata [[META185:![0-9]+]], metadata !DIExpression()), !dbg [[DBG183]]
+// CHECK-NEXT: store i64 [[A]], i64* [[A_ADDR]], align 8
+// CHECK-NEXT: call void @llvm.dbg.declare(metadata i64* [[A_ADDR]], metadata [[META186:![0-9]+]], metadata !DIExpression()), !dbg [[DBG183]]
+// CHECK-NEXT: store [10 x [10 x i32]]* [[B]], [10 x [10 x i32]]** [[B_ADDR]], align 8
+// CHECK-NEXT: call void @llvm.dbg.declare(metadata [10 x [10 x i32]]** [[B_ADDR]], metadata [[META187:![0-9]+]], metadata !DIExpression()), !dbg [[DBG183]]
+// CHECK-NEXT: store i8* [[BB]], i8** [[BB_ADDR]], align 8
+// CHECK-NEXT: call void @llvm.dbg.declare(metadata i8** [[BB_ADDR]], metadata [[META188:![0-9]+]], metadata !DIExpression()), !dbg [[DBG183]]
+// CHECK-NEXT: [[TMP0:%.*]] = load [10 x [10 x [10 x i32]]]*, [10 x [10 x [10 x i32]]]** [[C_ADDR]], align 8, !dbg [[DBG189:![0-9]+]]
+// CHECK-NEXT: [[CONV:%.*]] = bitcast i64* [[A_ADDR]] to i32*, !dbg [[DBG189]]
+// CHECK-NEXT: [[TMP1:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[B_ADDR]], align 8, !dbg [[DBG189]]
+// CHECK-NEXT: [[TMP2:%.*]] = load i8*, i8** [[BB_ADDR]], align 8, !dbg [[DBG189]]
+// CHECK-NEXT: [[TMP3:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8, !dbg [[DBG189]]
+// CHECK-NEXT: [[TMP4:%.*]] = load i32*, i32** [[DOTBOUND_TID__ADDR]], align 8, !dbg [[DBG189]]
+// CHECK-NEXT: [[TMP5:%.*]] = load [10 x [10 x [10 x i32]]]*, [10 x [10 x [10 x i32]]]** [[C_ADDR]], align 8, !dbg [[DBG189]]
+// CHECK-NEXT: [[TMP6:%.*]] = load i32, i32* [[CONV]], align 8, !dbg [[DBG189]]
+// CHECK-NEXT: [[TMP7:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[B_ADDR]], align 8, !dbg [[DBG189]]
+// CHECK-NEXT: [[TMP8:%.*]] = load i8*, i8** [[BB_ADDR]], align 8, !dbg [[DBG189]]
+// CHECK-NEXT: [[TMP9:%.*]] = addrspacecast [10 x [10 x [10 x i32]]]* [[TMP5]] to [10 x [10 x [10 x i32]]] addrspace(1)*, !dbg [[DBG189]]
+// CHECK-NEXT: [[TMP10:%.*]] = addrspacecast [10 x [10 x i32]]* [[TMP7]] to [10 x [10 x i32]] addrspace(1)*, !dbg [[DBG189]]
+// CHECK-NEXT: [[TMP11:%.*]] = addrspacecast i8* [[TMP8]] to i8 addrspace(1)*, !dbg [[DBG189]]
+// CHECK-NEXT: call void @__omp_outlined___debug__1(i32* [[TMP3]], i32* [[TMP4]], [10 x [10 x [10 x i32]]] addrspace(1)* [[TMP9]], i32 [[TMP6]], [10 x [10 x i32]] addrspace(1)* [[TMP10]], i8 addrspace(1)* [[TMP11]]) #[[ATTR4]], !dbg [[DBG189]]
+// CHECK-NEXT: ret void, !dbg [[DBG189]]
+//
+//
+// CHECK-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l37
+// CHECK-SAME: ([10 x [10 x [10 x i32]]]* nonnull align 4 dereferenceable(4000) [[C:%.*]], i64 [[A:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[B:%.*]], i8* nonnull align 1 dereferenceable(1) [[BB:%.*]]) #[[ATTR0]] !dbg [[DBG190:![0-9]+]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x [10 x i32]]]*, align 8
+// CHECK-NEXT: [[A_ADDR:%.*]] = alloca i64, align 8
+// CHECK-NEXT: [[B_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 8
+// CHECK-NEXT: [[BB_ADDR:%.*]] = alloca i8*, align 8
+// CHECK-NEXT: store [10 x [10 x [10 x i32]]]* [[C]], [10 x [10 x [10 x i32]]]** [[C_ADDR]], align 8
+// CHECK-NEXT: call void @llvm.dbg.declare(metadata [10 x [10 x [10 x i32]]]** [[C_ADDR]], metadata [[META191:![0-9]+]], metadata !DIExpression()), !dbg [[DBG192:![0-9]+]]
+// CHECK-NEXT: store i64 [[A]], i64* [[A_ADDR]], align 8
+// CHECK-NEXT: call void @llvm.dbg.declare(metadata i64* [[A_ADDR]], metadata [[META193:![0-9]+]], metadata !DIExpression()), !dbg [[DBG192]]
+// CHECK-NEXT: store [10 x [10 x i32]]* [[B]], [10 x [10 x i32]]** [[B_ADDR]], align 8
+// CHECK-NEXT: call void @llvm.dbg.declare(metadata [10 x [10 x i32]]** [[B_ADDR]], metadata [[META194:![0-9]+]], metadata !DIExpression()), !dbg [[DBG192]]
+// CHECK-NEXT: store i8* [[BB]], i8** [[BB_ADDR]], align 8
+// CHECK-NEXT: call void @llvm.dbg.declare(metadata i8** [[BB_ADDR]], metadata [[META195:![0-9]+]], metadata !DIExpression()), !dbg [[DBG192]]
+// CHECK-NEXT: [[TMP0:%.*]] = load [10 x [10 x [10 x i32]]]*, [10 x [10 x [10 x i32]]]** [[C_ADDR]], align 8, !dbg [[DBG196:![0-9]+]]
+// CHECK-NEXT: [[CONV:%.*]] = bitcast i64* [[A_ADDR]] to i32*, !dbg [[DBG196]]
+// CHECK-NEXT: [[TMP1:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[B_ADDR]], align 8, !dbg [[DBG196]]
+// CHECK-NEXT: [[TMP2:%.*]] = load i8*, i8** [[BB_ADDR]], align 8, !dbg [[DBG196]]
+// CHECK-NEXT: [[TMP3:%.*]] = load [10 x [10 x [10 x i32]]]*, [10 x [10 x [10 x i32]]]** [[C_ADDR]], align 8, !dbg [[DBG196]]
+// CHECK-NEXT: [[TMP4:%.*]] = load i32, i32* [[CONV]], align 8, !dbg [[DBG196]]
+// CHECK-NEXT: [[TMP5:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[B_ADDR]], align 8, !dbg [[DBG196]]
+// CHECK-NEXT: [[TMP6:%.*]] = load i8*, i8** [[BB_ADDR]], align 8, !dbg [[DBG196]]
+// CHECK-NEXT: [[TMP7:%.*]] = addrspacecast [10 x [10 x [10 x i32]]]* [[TMP3]] to [10 x [10 x [10 x i32]]] addrspace(1)*, !dbg [[DBG196]]
+// CHECK-NEXT: [[TMP8:%.*]] = addrspacecast [10 x [10 x i32]]* [[TMP5]] to [10 x [10 x i32]] addrspace(1)*, !dbg [[DBG196]]
+// CHECK-NEXT: [[TMP9:%.*]] = addrspacecast i8* [[TMP6]] to i8 addrspace(1)*, !dbg [[DBG196]]
+// CHECK-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l37_debug__([10 x [10 x [10 x i32]]] addrspace(1)* [[TMP7]], i32 [[TMP4]], [10 x [10 x i32]] addrspace(1)* [[TMP8]], i8 addrspace(1)* [[TMP9]]) #[[ATTR4]], !dbg [[DBG196]]
+// CHECK-NEXT: ret void, !dbg [[DBG196]]
+//
+//
+// CHECK-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l51_debug__
+// CHECK-SAME: ([10 x [10 x [10 x i32]]] addrspace(1)* noalias [[C:%.*]], i32 addrspace(1)* noalias [[A:%.*]], [10 x [10 x i32]] addrspace(1)* noalias [[B:%.*]], i8 addrspace(1)* noalias [[BB:%.*]]) #[[ATTR0]] !dbg [[DBG197:![0-9]+]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x [10 x i32]]] addrspace(1)*, align 8
+// CHECK-NEXT: [[A_ADDR:%.*]] = alloca i32 addrspace(1)*, align 8
+// CHECK-NEXT: [[B_ADDR:%.*]] = alloca [10 x [10 x i32]] addrspace(1)*, align 8
+// CHECK-NEXT: [[BB_ADDR:%.*]] = alloca i8 addrspace(1)*, align 8
+// CHECK-NEXT: [[TMP:%.*]] = alloca [10 x [10 x [10 x i32]]]*, align 8
+// CHECK-NEXT: [[_TMP1:%.*]] = alloca i32*, align 8
+// CHECK-NEXT: [[_TMP2:%.*]] = alloca [10 x [10 x i32]]*, align 8
+// CHECK-NEXT: [[_TMP3:%.*]] = alloca i8*, align 8
+// CHECK-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x i8*], align 8
+// CHECK-NEXT: store [10 x [10 x [10 x i32]]] addrspace(1)* [[C]], [10 x [10 x [10 x i32]]] addrspace(1)** [[C_ADDR]], align 8
+// CHECK-NEXT: call void @llvm.dbg.declare(metadata [10 x [10 x [10 x i32]]] addrspace(1)** [[C_ADDR]], metadata [[META202:![0-9]+]], metadata !DIExpression()), !dbg [[DBG203:![0-9]+]]
+// CHECK-NEXT: store i32 addrspace(1)* [[A]], i32 addrspace(1)** [[A_ADDR]], align 8
+// CHECK-NEXT: call void @llvm.dbg.declare(metadata i32 addrspace(1)** [[A_ADDR]], metadata [[META204:![0-9]+]], metadata !DIExpression()), !dbg [[DBG205:![0-9]+]]
+// CHECK-NEXT: store [10 x [10 x i32]] addrspace(1)* [[B]], [10 x [10 x i32]] addrspace(1)** [[B_ADDR]], align 8
+// CHECK-NEXT: call void @llvm.dbg.declare(metadata [10 x [10 x i32]] addrspace(1)** [[B_ADDR]], metadata [[META206:![0-9]+]], metadata !DIExpression()), !dbg [[DBG207:![0-9]+]]
+// CHECK-NEXT: store i8 addrspace(1)* [[BB]], i8 addrspace(1)** [[BB_ADDR]], align 8
+// CHECK-NEXT: call void @llvm.dbg.declare(metadata i8 addrspace(1)** [[BB_ADDR]], metadata [[META208:![0-9]+]], metadata !DIExpression()), !dbg [[DBG209:![0-9]+]]
+// CHECK-NEXT: [[TMP0:%.*]] = load [10 x [10 x [10 x i32]]] addrspace(1)*, [10 x [10 x [10 x i32]]] addrspace(1)** [[C_ADDR]], align 8, !dbg [[DBG210:![0-9]+]]
+// CHECK-NEXT: [[TMP1:%.*]] = addrspacecast [10 x [10 x [10 x i32]]] addrspace(1)* [[TMP0]] to [10 x [10 x [10 x i32]]]*, !dbg [[DBG210]]
+// CHECK-NEXT: store [10 x [10 x [10 x i32]]]* [[TMP1]], [10 x [10 x [10 x i32]]]** [[TMP]], align 8, !dbg [[DBG210]]
+// CHECK-NEXT: [[TMP2:%.*]] = load [10 x [10 x [10 x i32]]]*, [10 x [10 x [10 x i32]]]** [[TMP]], align 8, !dbg [[DBG210]]
+// CHECK-NEXT: [[TMP3:%.*]] = load i32 addrspace(1)*, i32 addrspace(1)** [[A_ADDR]], align 8, !dbg [[DBG210]]
+// CHECK-NEXT: [[TMP4:%.*]] = addrspacecast i32 addrspace(1)* [[TMP3]] to i32*, !dbg [[DBG210]]
+// CHECK-NEXT: store i32* [[TMP4]], i32** [[_TMP1]], align 8, !dbg [[DBG210]]
+// CHECK-NEXT: [[TMP5:%.*]] = load i32*, i32** [[_TMP1]], align 8, !dbg [[DBG210]]
+// CHECK-NEXT: [[TMP6:%.*]] = load [10 x [10 x i32]] addrspace(1)*, [10 x [10 x i32]] addrspace(1)** [[B_ADDR]], align 8, !dbg [[DBG210]]
+// CHECK-NEXT: [[TMP7:%.*]] = addrspacecast [10 x [10 x i32]] addrspace(1)* [[TMP6]] to [10 x [10 x i32]]*, !dbg [[DBG210]]
+// CHECK-NEXT: store [10 x [10 x i32]]* [[TMP7]], [10 x [10 x i32]]** [[_TMP2]], align 8, !dbg [[DBG210]]
+// CHECK-NEXT: [[TMP8:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[_TMP2]], align 8, !dbg [[DBG210]]
+// CHECK-NEXT: [[TMP9:%.*]] = load i8 addrspace(1)*, i8 addrspace(1)** [[BB_ADDR]], align 8, !dbg [[DBG210]]
+// CHECK-NEXT: [[TMP10:%.*]] = addrspacecast i8 addrspace(1)* [[TMP9]] to i8*, !dbg [[DBG210]]
+// CHECK-NEXT: store i8* [[TMP10]], i8** [[_TMP3]], align 8, !dbg [[DBG210]]
+// CHECK-NEXT: [[TMP11:%.*]] = load i8*, i8** [[_TMP3]], align 8, !dbg [[DBG210]]
+// CHECK-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !dbg [[DBG210]]
+// CHECK-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 1), !dbg [[DBG210]]
+// CHECK-NEXT: call void @__kmpc_data_sharing_init_stack_spmd(), !dbg [[DBG210]]
+// CHECK-NEXT: br label [[DOTEXECUTE:%.*]], !dbg [[DBG210]]
+// CHECK: .execute:
+// CHECK-NEXT: [[TMP12:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB5:[0-9]+]])
+// CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0, !dbg [[DBG211:![0-9]+]]
+// CHECK-NEXT: [[TMP14:%.*]] = bitcast [10 x [10 x [10 x i32]]]* [[TMP2]] to i8*, !dbg [[DBG211]]
+// CHECK-NEXT: store i8* [[TMP14]], i8** [[TMP13]], align 8, !dbg [[DBG211]]
+// CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1, !dbg [[DBG211]]
+// CHECK-NEXT: [[TMP16:%.*]] = bitcast i32* [[TMP5]] to i8*, !dbg [[DBG211]]
+// CHECK-NEXT: store i8* [[TMP16]], i8** [[TMP15]], align 8, !dbg [[DBG211]]
+// CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 2, !dbg [[DBG211]]
+// CHECK-NEXT: [[TMP18:%.*]] = bitcast [10 x [10 x i32]]* [[TMP8]] to i8*, !dbg [[DBG211]]
+// CHECK-NEXT: store i8* [[TMP18]], i8** [[TMP17]], align 8, !dbg [[DBG211]]
+// CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 3, !dbg [[DBG211]]
+// CHECK-NEXT: store i8* [[TMP11]], i8** [[TMP19]], align 8, !dbg [[DBG211]]
+// CHECK-NEXT: [[TMP20:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**, !dbg [[DBG211]]
+// CHECK-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB5]], i32 [[TMP12]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, [10 x [10 x [10 x i32]]]*, i32*, [10 x [10 x i32]]*, i8*)* @__omp_outlined__4 to i8*), i8* null, i8** [[TMP20]], i64 4), !dbg [[DBG211]]
+// CHECK-NEXT: br label [[DOTOMP_DEINIT:%.*]], !dbg [[DBG212:![0-9]+]]
+// CHECK: .omp.deinit:
+// CHECK-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 1), !dbg [[DBG212]]
+// CHECK-NEXT: br label [[DOTEXIT:%.*]], !dbg [[DBG212]]
+// CHECK: .exit:
+// CHECK-NEXT: ret void, !dbg [[DBG214:![0-9]+]]
+//
+//
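Note the difference from the l23/l37 kernels: the l51 kernel above captures the scalar a by reference (i32 addrspace(1)*), so every captured_vars_addrs slot is a plain pointer store and no inttoptr round-trip appears. A tiny illustrative C sketch (names are hypothetical):

/* Sketch: all four slots hold pointers when 'a' is captured by reference. */
static void pack_captured_by_ref(void *slots[4], void *c, int *a, void *b,
                                 void *bb) {
  slots[0] = c;
  slots[1] = a;  /* pointer capture, no inttoptr needed */
  slots[2] = b;
  slots[3] = bb;
}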
CHECK-LABEL: define {{[^@]+}}@__omp_outlined___debug__3 +// CHECK-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x [10 x [10 x i32]]] addrspace(1)* noalias [[C:%.*]], i32 addrspace(1)* noalias [[A:%.*]], [10 x [10 x i32]] addrspace(1)* noalias [[B:%.*]], i8 addrspace(1)* noalias [[BB:%.*]]) #[[ATTR0]] !dbg [[DBG215:![0-9]+]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x [10 x i32]]] addrspace(1)*, align 8 +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca i32 addrspace(1)*, align 8 +// CHECK-NEXT: [[B_ADDR:%.*]] = alloca [10 x [10 x i32]] addrspace(1)*, align 8 +// CHECK-NEXT: [[BB_ADDR:%.*]] = alloca i8 addrspace(1)*, align 8 +// CHECK-NEXT: [[TMP:%.*]] = alloca [10 x [10 x [10 x i32]]]*, align 8 +// CHECK-NEXT: [[_TMP1:%.*]] = alloca i32*, align 8 +// CHECK-NEXT: [[_TMP2:%.*]] = alloca [10 x [10 x i32]]*, align 8 +// CHECK-NEXT: [[_TMP3:%.*]] = alloca i8*, align 8 +// CHECK-NEXT: [[F:%.*]] = alloca i32*, align 8 +// CHECK-NEXT: [[G:%.*]] = alloca i32*, align 8 +// CHECK-NEXT: [[H:%.*]] = alloca i32*, align 8 +// CHECK-NEXT: [[D:%.*]] = alloca i32, align 4 +// CHECK-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK-NEXT: call void @llvm.dbg.declare(metadata i32** [[DOTGLOBAL_TID__ADDR]], metadata [[META218:![0-9]+]], metadata !DIExpression()), !dbg [[DBG219:![0-9]+]] +// CHECK-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +// CHECK-NEXT: call void @llvm.dbg.declare(metadata i32** [[DOTBOUND_TID__ADDR]], metadata [[META220:![0-9]+]], metadata !DIExpression()), !dbg [[DBG219]] +// CHECK-NEXT: store [10 x [10 x [10 x i32]]] addrspace(1)* [[C]], [10 x [10 x [10 x i32]]] addrspace(1)** [[C_ADDR]], align 8 +// CHECK-NEXT: call void @llvm.dbg.declare(metadata [10 x [10 x [10 x i32]]] addrspace(1)** [[C_ADDR]], metadata [[META221:![0-9]+]], metadata !DIExpression()), !dbg [[DBG222:![0-9]+]] +// CHECK-NEXT: store i32 addrspace(1)* [[A]], i32 addrspace(1)** [[A_ADDR]], align 8 +// CHECK-NEXT: call void @llvm.dbg.declare(metadata i32 addrspace(1)** [[A_ADDR]], metadata [[META223:![0-9]+]], metadata !DIExpression()), !dbg [[DBG224:![0-9]+]] +// CHECK-NEXT: store [10 x [10 x i32]] addrspace(1)* [[B]], [10 x [10 x i32]] addrspace(1)** [[B_ADDR]], align 8 +// CHECK-NEXT: call void @llvm.dbg.declare(metadata [10 x [10 x i32]] addrspace(1)** [[B_ADDR]], metadata [[META225:![0-9]+]], metadata !DIExpression()), !dbg [[DBG226:![0-9]+]] +// CHECK-NEXT: store i8 addrspace(1)* [[BB]], i8 addrspace(1)** [[BB_ADDR]], align 8 +// CHECK-NEXT: call void @llvm.dbg.declare(metadata i8 addrspace(1)** [[BB_ADDR]], metadata [[META227:![0-9]+]], metadata !DIExpression()), !dbg [[DBG228:![0-9]+]] +// CHECK-NEXT: [[TMP0:%.*]] = load [10 x [10 x [10 x i32]]] addrspace(1)*, [10 x [10 x [10 x i32]]] addrspace(1)** [[C_ADDR]], align 8, !dbg [[DBG229:![0-9]+]] +// CHECK-NEXT: [[TMP1:%.*]] = addrspacecast [10 x [10 x [10 x i32]]] addrspace(1)* [[TMP0]] to [10 x [10 x [10 x i32]]]*, !dbg [[DBG229]] +// CHECK-NEXT: store [10 x [10 x [10 x i32]]]* [[TMP1]], [10 x [10 x [10 x i32]]]** [[TMP]], align 8, !dbg [[DBG229]] +// CHECK-NEXT: [[TMP2:%.*]] = load [10 x [10 x [10 x i32]]]*, [10 x [10 x [10 x i32]]]** [[TMP]], align 8, !dbg [[DBG229]] +// CHECK-NEXT: [[TMP3:%.*]] = load i32 addrspace(1)*, i32 addrspace(1)** [[A_ADDR]], align 8, !dbg [[DBG229]] +// CHECK-NEXT: [[TMP4:%.*]] = 
addrspacecast i32 addrspace(1)* [[TMP3]] to i32*, !dbg [[DBG229]] +// CHECK-NEXT: store i32* [[TMP4]], i32** [[_TMP1]], align 8, !dbg [[DBG229]] +// CHECK-NEXT: [[TMP5:%.*]] = load i32*, i32** [[_TMP1]], align 8, !dbg [[DBG229]] +// CHECK-NEXT: [[TMP6:%.*]] = load [10 x [10 x i32]] addrspace(1)*, [10 x [10 x i32]] addrspace(1)** [[B_ADDR]], align 8, !dbg [[DBG229]] +// CHECK-NEXT: [[TMP7:%.*]] = addrspacecast [10 x [10 x i32]] addrspace(1)* [[TMP6]] to [10 x [10 x i32]]*, !dbg [[DBG229]] +// CHECK-NEXT: store [10 x [10 x i32]]* [[TMP7]], [10 x [10 x i32]]** [[_TMP2]], align 8, !dbg [[DBG229]] +// CHECK-NEXT: [[TMP8:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[_TMP2]], align 8, !dbg [[DBG229]] +// CHECK-NEXT: [[TMP9:%.*]] = load i8 addrspace(1)*, i8 addrspace(1)** [[BB_ADDR]], align 8, !dbg [[DBG229]] +// CHECK-NEXT: [[TMP10:%.*]] = addrspacecast i8 addrspace(1)* [[TMP9]] to i8*, !dbg [[DBG229]] +// CHECK-NEXT: store i8* [[TMP10]], i8** [[_TMP3]], align 8, !dbg [[DBG229]] +// CHECK-NEXT: [[TMP11:%.*]] = load i8*, i8** [[_TMP3]], align 8, !dbg [[DBG229]] +// CHECK-NEXT: call void @llvm.dbg.declare(metadata i32** [[F]], metadata [[META230:![0-9]+]], metadata !DIExpression()), !dbg [[DBG232:![0-9]+]] +// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x [10 x [10 x i32]]], [10 x [10 x [10 x i32]]]* [[TMP2]], i64 0, i64 1, !dbg [[DBG233:![0-9]+]] +// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* [[ARRAYIDX]], i64 0, i64 1, !dbg [[DBG233]] +// CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[ARRAYIDX4]], i64 0, i64 1, !dbg [[DBG233]] +// CHECK-NEXT: store i32* [[ARRAYIDX5]], i32** [[F]], align 8, !dbg [[DBG232]] +// CHECK-NEXT: call void @llvm.dbg.declare(metadata i32** [[G]], metadata [[META234:![0-9]+]], metadata !DIExpression()), !dbg [[DBG235:![0-9]+]] +// CHECK-NEXT: store i32* [[TMP5]], i32** [[G]], align 8, !dbg [[DBG235]] +// CHECK-NEXT: call void @llvm.dbg.declare(metadata i32** [[H]], metadata [[META236:![0-9]+]], metadata !DIExpression()), !dbg [[DBG237:![0-9]+]] +// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* [[TMP8]], i64 0, i64 1, !dbg [[DBG238:![0-9]+]] +// CHECK-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[ARRAYIDX6]], i64 0, i64 1, !dbg [[DBG238]] +// CHECK-NEXT: store i32* [[ARRAYIDX7]], i32** [[H]], align 8, !dbg [[DBG237]] +// CHECK-NEXT: call void @llvm.dbg.declare(metadata i32* [[D]], metadata [[META239:![0-9]+]], metadata !DIExpression()), !dbg [[DBG240:![0-9]+]] +// CHECK-NEXT: store i32 15, i32* [[D]], align 4, !dbg [[DBG240]] +// CHECK-NEXT: store i32 5, i32* [[TMP5]], align 4, !dbg [[DBG241:![0-9]+]] +// CHECK-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* [[TMP8]], i64 0, i64 0, !dbg [[DBG242:![0-9]+]] +// CHECK-NEXT: [[TMP12:%.*]] = load i32, i32* [[TMP5]], align 4, !dbg [[DBG243:![0-9]+]] +// CHECK-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP12]] to i64, !dbg [[DBG242]] +// CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[ARRAYIDX8]], i64 0, i64 [[IDXPROM]], !dbg [[DBG242]] +// CHECK-NEXT: store i32 10, i32* [[ARRAYIDX9]], align 4, !dbg [[DBG244:![0-9]+]] +// CHECK-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds [10 x [10 x [10 x i32]]], [10 x [10 x [10 x i32]]]* [[TMP2]], i64 0, i64 0, !dbg [[DBG245:![0-9]+]] +// CHECK-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* [[ARRAYIDX10]], 
i64 0, i64 0, !dbg [[DBG245]] +// CHECK-NEXT: [[TMP13:%.*]] = load i32, i32* [[TMP5]], align 4, !dbg [[DBG246:![0-9]+]] +// CHECK-NEXT: [[IDXPROM12:%.*]] = sext i32 [[TMP13]] to i64, !dbg [[DBG245]] +// CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[ARRAYIDX11]], i64 0, i64 [[IDXPROM12]], !dbg [[DBG245]] +// CHECK-NEXT: store i32 11, i32* [[ARRAYIDX13]], align 4, !dbg [[DBG247:![0-9]+]] +// CHECK-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds [10 x [10 x [10 x i32]]], [10 x [10 x [10 x i32]]]* [[TMP2]], i64 0, i64 0, !dbg [[DBG248:![0-9]+]] +// CHECK-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* [[ARRAYIDX14]], i64 0, i64 0, !dbg [[DBG248]] +// CHECK-NEXT: [[TMP14:%.*]] = load i32, i32* [[TMP5]], align 4, !dbg [[DBG249:![0-9]+]] +// CHECK-NEXT: [[IDXPROM16:%.*]] = sext i32 [[TMP14]] to i64, !dbg [[DBG248]] +// CHECK-NEXT: [[ARRAYIDX17:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[ARRAYIDX15]], i64 0, i64 [[IDXPROM16]], !dbg [[DBG248]] +// CHECK-NEXT: [[TMP15:%.*]] = load i32, i32* [[ARRAYIDX17]], align 4, !dbg [[DBG248]] +// CHECK-NEXT: [[ARRAYIDX18:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* [[TMP8]], i64 0, i64 0, !dbg [[DBG250:![0-9]+]] +// CHECK-NEXT: [[TMP16:%.*]] = load i32, i32* [[TMP5]], align 4, !dbg [[DBG251:![0-9]+]] +// CHECK-NEXT: [[IDXPROM19:%.*]] = sext i32 [[TMP16]] to i64, !dbg [[DBG250]] +// CHECK-NEXT: [[ARRAYIDX20:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[ARRAYIDX18]], i64 0, i64 [[IDXPROM19]], !dbg [[DBG250]] +// CHECK-NEXT: store i32 [[TMP15]], i32* [[ARRAYIDX20]], align 4, !dbg [[DBG252:![0-9]+]] +// CHECK-NEXT: [[ARRAYIDX21:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* [[TMP8]], i64 0, i64 0, !dbg [[DBG253:![0-9]+]] +// CHECK-NEXT: [[TMP17:%.*]] = load i32, i32* [[TMP5]], align 4, !dbg [[DBG254:![0-9]+]] +// CHECK-NEXT: [[IDXPROM22:%.*]] = sext i32 [[TMP17]] to i64, !dbg [[DBG253]] +// CHECK-NEXT: [[ARRAYIDX23:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[ARRAYIDX21]], i64 0, i64 [[IDXPROM22]], !dbg [[DBG253]] +// CHECK-NEXT: [[TMP18:%.*]] = load i32, i32* [[ARRAYIDX23]], align 4, !dbg [[DBG253]] +// CHECK-NEXT: [[TOBOOL:%.*]] = icmp ne i32 [[TMP18]], 0, !dbg [[DBG253]] +// CHECK-NEXT: [[FROMBOOL:%.*]] = zext i1 [[TOBOOL]] to i8, !dbg [[DBG255:![0-9]+]] +// CHECK-NEXT: store i8 [[FROMBOOL]], i8* [[TMP11]], align 1, !dbg [[DBG255]] +// CHECK-NEXT: ret void, !dbg [[DBG256:![0-9]+]] +// +// +// CHECK-LABEL: define {{[^@]+}}@__omp_outlined__4 +// CHECK-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x [10 x [10 x i32]]]* nonnull align 4 dereferenceable(4000) [[C:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[B:%.*]], i8* nonnull align 1 dereferenceable(1) [[BB:%.*]]) #[[ATTR0]] !dbg [[DBG257:![0-9]+]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x [10 x i32]]]*, align 8 +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 8 +// CHECK-NEXT: [[B_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 8 +// CHECK-NEXT: [[BB_ADDR:%.*]] = alloca i8*, align 8 +// CHECK-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK-NEXT: call void @llvm.dbg.declare(metadata i32** [[DOTGLOBAL_TID__ADDR]], metadata [[META260:![0-9]+]], 
metadata !DIExpression()), !dbg [[DBG261:![0-9]+]] +// CHECK-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +// CHECK-NEXT: call void @llvm.dbg.declare(metadata i32** [[DOTBOUND_TID__ADDR]], metadata [[META262:![0-9]+]], metadata !DIExpression()), !dbg [[DBG261]] +// CHECK-NEXT: store [10 x [10 x [10 x i32]]]* [[C]], [10 x [10 x [10 x i32]]]** [[C_ADDR]], align 8 +// CHECK-NEXT: call void @llvm.dbg.declare(metadata [10 x [10 x [10 x i32]]]** [[C_ADDR]], metadata [[META263:![0-9]+]], metadata !DIExpression()), !dbg [[DBG261]] +// CHECK-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 8 +// CHECK-NEXT: call void @llvm.dbg.declare(metadata i32** [[A_ADDR]], metadata [[META264:![0-9]+]], metadata !DIExpression()), !dbg [[DBG261]] +// CHECK-NEXT: store [10 x [10 x i32]]* [[B]], [10 x [10 x i32]]** [[B_ADDR]], align 8 +// CHECK-NEXT: call void @llvm.dbg.declare(metadata [10 x [10 x i32]]** [[B_ADDR]], metadata [[META265:![0-9]+]], metadata !DIExpression()), !dbg [[DBG261]] +// CHECK-NEXT: store i8* [[BB]], i8** [[BB_ADDR]], align 8 +// CHECK-NEXT: call void @llvm.dbg.declare(metadata i8** [[BB_ADDR]], metadata [[META266:![0-9]+]], metadata !DIExpression()), !dbg [[DBG261]] +// CHECK-NEXT: [[TMP0:%.*]] = load [10 x [10 x [10 x i32]]]*, [10 x [10 x [10 x i32]]]** [[C_ADDR]], align 8, !dbg [[DBG267:![0-9]+]] +// CHECK-NEXT: [[TMP1:%.*]] = load i32*, i32** [[A_ADDR]], align 8, !dbg [[DBG267]] +// CHECK-NEXT: [[TMP2:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[B_ADDR]], align 8, !dbg [[DBG267]] +// CHECK-NEXT: [[TMP3:%.*]] = load i8*, i8** [[BB_ADDR]], align 8, !dbg [[DBG267]] +// CHECK-NEXT: [[TMP4:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8, !dbg [[DBG267]] +// CHECK-NEXT: [[TMP5:%.*]] = load i32*, i32** [[DOTBOUND_TID__ADDR]], align 8, !dbg [[DBG267]] +// CHECK-NEXT: [[TMP6:%.*]] = load [10 x [10 x [10 x i32]]]*, [10 x [10 x [10 x i32]]]** [[C_ADDR]], align 8, !dbg [[DBG267]] +// CHECK-NEXT: [[TMP7:%.*]] = load i32*, i32** [[A_ADDR]], align 8, !dbg [[DBG267]] +// CHECK-NEXT: [[TMP8:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[B_ADDR]], align 8, !dbg [[DBG267]] +// CHECK-NEXT: [[TMP9:%.*]] = load i8*, i8** [[BB_ADDR]], align 8, !dbg [[DBG267]] +// CHECK-NEXT: [[TMP10:%.*]] = addrspacecast [10 x [10 x [10 x i32]]]* [[TMP6]] to [10 x [10 x [10 x i32]]] addrspace(1)*, !dbg [[DBG267]] +// CHECK-NEXT: [[TMP11:%.*]] = addrspacecast i32* [[TMP7]] to i32 addrspace(1)*, !dbg [[DBG267]] +// CHECK-NEXT: [[TMP12:%.*]] = addrspacecast [10 x [10 x i32]]* [[TMP8]] to [10 x [10 x i32]] addrspace(1)*, !dbg [[DBG267]] +// CHECK-NEXT: [[TMP13:%.*]] = addrspacecast i8* [[TMP9]] to i8 addrspace(1)*, !dbg [[DBG267]] +// CHECK-NEXT: call void @__omp_outlined___debug__3(i32* [[TMP4]], i32* [[TMP5]], [10 x [10 x [10 x i32]]] addrspace(1)* [[TMP10]], i32 addrspace(1)* [[TMP11]], [10 x [10 x i32]] addrspace(1)* [[TMP12]], i8 addrspace(1)* [[TMP13]]) #[[ATTR4]], !dbg [[DBG267]] +// CHECK-NEXT: ret void, !dbg [[DBG267]] +// +// +// CHECK-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l51 +// CHECK-SAME: ([10 x [10 x [10 x i32]]]* nonnull align 4 dereferenceable(4000) [[C:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[B:%.*]], i8* nonnull align 1 dereferenceable(1) [[BB:%.*]]) #[[ATTR0]] !dbg [[DBG268:![0-9]+]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x [10 x i32]]]*, align 8 +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 8 +// CHECK-NEXT: 
[[B_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 8 +// CHECK-NEXT: [[BB_ADDR:%.*]] = alloca i8*, align 8 +// CHECK-NEXT: store [10 x [10 x [10 x i32]]]* [[C]], [10 x [10 x [10 x i32]]]** [[C_ADDR]], align 8 +// CHECK-NEXT: call void @llvm.dbg.declare(metadata [10 x [10 x [10 x i32]]]** [[C_ADDR]], metadata [[META271:![0-9]+]], metadata !DIExpression()), !dbg [[DBG272:![0-9]+]] +// CHECK-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 8 +// CHECK-NEXT: call void @llvm.dbg.declare(metadata i32** [[A_ADDR]], metadata [[META273:![0-9]+]], metadata !DIExpression()), !dbg [[DBG272]] +// CHECK-NEXT: store [10 x [10 x i32]]* [[B]], [10 x [10 x i32]]** [[B_ADDR]], align 8 +// CHECK-NEXT: call void @llvm.dbg.declare(metadata [10 x [10 x i32]]** [[B_ADDR]], metadata [[META274:![0-9]+]], metadata !DIExpression()), !dbg [[DBG272]] +// CHECK-NEXT: store i8* [[BB]], i8** [[BB_ADDR]], align 8 +// CHECK-NEXT: call void @llvm.dbg.declare(metadata i8** [[BB_ADDR]], metadata [[META275:![0-9]+]], metadata !DIExpression()), !dbg [[DBG272]] +// CHECK-NEXT: [[TMP0:%.*]] = load [10 x [10 x [10 x i32]]]*, [10 x [10 x [10 x i32]]]** [[C_ADDR]], align 8, !dbg [[DBG276:![0-9]+]] +// CHECK-NEXT: [[TMP1:%.*]] = load i32*, i32** [[A_ADDR]], align 8, !dbg [[DBG276]] +// CHECK-NEXT: [[TMP2:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[B_ADDR]], align 8, !dbg [[DBG276]] +// CHECK-NEXT: [[TMP3:%.*]] = load i8*, i8** [[BB_ADDR]], align 8, !dbg [[DBG276]] +// CHECK-NEXT: [[TMP4:%.*]] = load [10 x [10 x [10 x i32]]]*, [10 x [10 x [10 x i32]]]** [[C_ADDR]], align 8, !dbg [[DBG276]] +// CHECK-NEXT: [[TMP5:%.*]] = load i32*, i32** [[A_ADDR]], align 8, !dbg [[DBG276]] +// CHECK-NEXT: [[TMP6:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[B_ADDR]], align 8, !dbg [[DBG276]] +// CHECK-NEXT: [[TMP7:%.*]] = load i8*, i8** [[BB_ADDR]], align 8, !dbg [[DBG276]] +// CHECK-NEXT: [[TMP8:%.*]] = addrspacecast [10 x [10 x [10 x i32]]]* [[TMP4]] to [10 x [10 x [10 x i32]]] addrspace(1)*, !dbg [[DBG276]] +// CHECK-NEXT: [[TMP9:%.*]] = addrspacecast i32* [[TMP5]] to i32 addrspace(1)*, !dbg [[DBG276]] +// CHECK-NEXT: [[TMP10:%.*]] = addrspacecast [10 x [10 x i32]]* [[TMP6]] to [10 x [10 x i32]] addrspace(1)*, !dbg [[DBG276]] +// CHECK-NEXT: [[TMP11:%.*]] = addrspacecast i8* [[TMP7]] to i8 addrspace(1)*, !dbg [[DBG276]] +// CHECK-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l51_debug__([10 x [10 x [10 x i32]]] addrspace(1)* [[TMP8]], i32 addrspace(1)* [[TMP9]], [10 x [10 x i32]] addrspace(1)* [[TMP10]], i8 addrspace(1)* [[TMP11]]) #[[ATTR4]], !dbg [[DBG276]] +// CHECK-NEXT: ret void, !dbg [[DBG276]] +// diff --git a/clang/test/OpenMP/target_parallel_for_debug_codegen.cpp b/clang/test/OpenMP/target_parallel_for_debug_codegen.cpp --- a/clang/test/OpenMP/target_parallel_for_debug_codegen.cpp +++ b/clang/test/OpenMP/target_parallel_for_debug_codegen.cpp @@ -1,3 +1,4 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature --include-generated-funcs --replace-function-regex "__omp_offloading_[0-9a-z]+_[0-9a-z]+" --prefix-filecheck-ir-name _ // RUN: %clang_cc1 -DCK1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -fopenmp-cuda-mode -emit-llvm-bc %s -o %t-ppc-host.bc // RUN: %clang_cc1 -DCK1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -fopenmp-cuda-mode -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -debug-info-kind=limited | FileCheck %s 
// expected-no-diagnostics @@ -53,63 +54,893 @@ } return 0; } - -// CHECK: define internal void @__omp_offloading{{[^(]+}}([10 x [10 x [10 x i32]]] addrspace(1)* {{[^,]+}}, i32 {{[^,]+}}, [10 x [10 x i32]]* {{[^,]+}}, i8 addrspace(1)* noalias{{[^,]+}}, i1 {{[^)]+}}) -// CHECK: addrspacecast [10 x [10 x [10 x i32]]] addrspace(1)* %{{.+}} to [10 x [10 x [10 x i32]]]* -// CHECK: call void [[NONDEBUG_WRAPPER:.+]](i32* {{[^,]+}}, i32* {{[^,]+}}, [10 x [10 x [10 x i32]]]* {{[^,]+}}, i64 {{[^,]+}}, [10 x [10 x i32]]* {{[^,]+}}, i8* {{[^)]+}}) - -// CHECK: define internal void [[DEBUG_PARALLEL:.+]](i32* {{[^,]+}}, i32* {{[^,]+}}, [10 x [10 x [10 x i32]]] addrspace(1)* noalias{{[^,]+}}, i32 {{[^,]+}}, [10 x [10 x i32]]* noalias{{[^,]+}}, i8 addrspace(1)* noalias{{[^)]+}}) -// CHECK: addrspacecast [10 x [10 x [10 x i32]]] addrspace(1)* %{{.+}} to [10 x [10 x [10 x i32]]]* - -// CHECK: define internal void [[NONDEBUG_WRAPPER]](i32* {{[^,]+}}, i32* {{[^,]+}}, [10 x [10 x [10 x i32]]]* nonnull align {{[0-9]+}} dereferenceable{{[^,]+}}, i64 {{[^,]+}}, [10 x [10 x i32]]* nonnull align {{[0-9]+}} dereferenceable{{[^,]+}}, i8* nonnull align {{[0-9]+}} dereferenceable{{[^)]+}}) -// CHECK: addrspacecast [10 x [10 x [10 x i32]]]* %{{.+}} to [10 x [10 x [10 x i32]]] addrspace(1)* -// CHECK: call void [[DEBUG_PARALLEL]](i32* {{[^,]+}}, i32* {{[^,]+}}, [10 x [10 x [10 x i32]]] addrspace(1)* {{[^,]+}}, i32 {{[^,]+}}, [10 x [10 x i32]]* {{[^,]+}}, i8 addrspace(1)* {{[^)]+}}) - -// CHECK: define weak void @__omp_offloading_{{[^(]+}}([10 x [10 x [10 x i32]]]* nonnull align {{[0-9]+}} dereferenceable{{[^,]+}}, i64 {{[^,]+}}, [10 x [10 x i32]]* nonnull align {{[0-9]+}} dereferenceable{{[^,]+}}, i8* nonnull align {{[0-9]+}} dereferenceable{{[^)]+}}) -// CHECK: addrspacecast [10 x [10 x [10 x i32]]]* %{{.+}} to [10 x [10 x [10 x i32]]] addrspace(1)* -// CHECK: call void @__omp_offloading_{{[^(]+}}([10 x [10 x [10 x i32]]] addrspace(1)* {{[^,]+}}, i32 {{[^,]+}}, [10 x [10 x i32]]* {{[^,]+}}, i8 addrspace(1)* {{[^)]+}}) - -// CHECK: define internal void @__omp_offloading_{{[^(]+}}([10 x [10 x [10 x i32]]] addrspace(1)* noalias{{[^,]+}}, i32 {{[^,]+}}, [10 x [10 x i32]] addrspace(1)* noalias{{[^,]+}}, i8 addrspace(1)* noalias{{[^)]+}}) -// CHECK: addrspacecast [10 x [10 x [10 x i32]]] addrspace(1)* %{{.+}} to [10 x [10 x [10 x i32]]]* -// CHECK: addrspacecast [10 x [10 x i32]] addrspace(1)* %{{.+}} to [10 x [10 x i32]]* -// CHECK: call void [[NONDEBUG_WRAPPER:.+]](i32* {{[^,]+}}, i32* {{[^,]+}}, [10 x [10 x [10 x i32]]]* {{[^,]+}}, i64 {{[^,]+}}, [10 x [10 x i32]]* {{[^,]+}}, i8* {{[^)]+}}) - -// CHECK: define internal void [[DEBUG_PARALLEL:@.+]](i32* {{[^,]+}}, i32* {{[^,]+}}, [10 x [10 x [10 x i32]]] addrspace(1)* noalias{{[^,]+}}, i32 {{[^,]+}}, [10 x [10 x i32]] addrspace(1)* noalias{{[^,]+}}, i8 addrspace(1)* {{[^)]+}}) -// CHECK: addrspacecast [10 x [10 x [10 x i32]]] addrspace(1)* %{{.+}} to [10 x [10 x [10 x i32]]]* -// CHECK: addrspacecast [10 x [10 x i32]] addrspace(1)* %{{.+}} to [10 x [10 x i32]]* - -// CHECK: define internal void [[NONDEBUG_WRAPPER]](i32* {{[^,]+}}, i32* {{[^,]+}}, [10 x [10 x [10 x i32]]]* nonnull align {{[0-9]+}} dereferenceable{{[^,]+}}, i64 {{[^,]+}}, [10 x [10 x i32]]* nonnull align {{[0-9]+}} dereferenceable{{[^,]+}}, i8* nonnull align {{[0-9]+}} dereferenceable{{[^)]+}}) -// CHECK: addrspacecast [10 x [10 x [10 x i32]]]* %{{.+}} to [10 x [10 x [10 x i32]]] addrspace(1)* -// CHECK: addrspacecast [10 x [10 x i32]]* %{{.+}} to [10 x [10 x i32]] addrspace(1)* -// CHECK: call void 
[[DEBUG_PARALLEL]](i32* {{[^,]+}}, i32* {{[^,]+}}, [10 x [10 x [10 x i32]]] addrspace(1)* {{[^,]+}}, i32 {{[^,]+}}, [10 x [10 x i32]] addrspace(1)* {{[^,]+}}, i8 addrspace(1)* {{[^)]+}}) - -// CHECK: define weak void @__omp_offloading_{{[^(]+}}([10 x [10 x [10 x i32]]]* nonnull align {{[0-9]+}} dereferenceable{{[^,]+}}, i64 {{[^,]+}}, [10 x [10 x i32]]* nonnull align {{[0-9]+}} dereferenceable{{[^,]+}}, i8* nonnull align {{[0-9]+}} dereferenceable{{[^)]+}}) -// CHECK: addrspacecast [10 x [10 x [10 x i32]]]* %{{.+}} to [10 x [10 x [10 x i32]]] addrspace(1)* -// CHECK: addrspacecast [10 x [10 x i32]]* %{{.+}} to [10 x [10 x i32]] addrspace(1)* -// CHECK: call void @__omp_offloading_{{[^(]+}}([10 x [10 x [10 x i32]]] addrspace(1)* {{[^,]+}}, i32 {{[^,]+}}, [10 x [10 x i32]] addrspace(1)* {{[^,]+}}, i8 addrspace(1)* {{[^)]+}}) - -// CHECK: define internal void @__omp_offloading_{{[^(]+}}([10 x [10 x [10 x i32]]] addrspace(1)* noalias{{[^,]+}}, i32 addrspace(1)* noalias{{[^,]+}}, [10 x [10 x i32]] addrspace(1)* noalias{{[^,]+}}, i8 addrspace(1)* noalias{{[^)]+}}) -// CHECK: addrspacecast [10 x [10 x [10 x i32]]] addrspace(1)* %{{.+}} to [10 x [10 x [10 x i32]]]* -// CHECK: addrspacecast i32 addrspace(1)* %{{.+}} to i32* -// CHECK: addrspacecast [10 x [10 x i32]] addrspace(1)* %{{.+}} to [10 x [10 x i32]]* -// CHECK: call void @[[NONDEBUG_WRAPPER:.+]](i32* {{[^,]+}}, i32* {{[^,]+}}, [10 x [10 x [10 x i32]]]* {{[^,]+}}, i32* {{[^,]+}}, [10 x [10 x i32]]* {{[^,]+}}, i8* {{[^)]+}}) - -// CHECK: define internal void @[[DEBUG_PARALLEL:.+]](i32* {{[^,]+}}, i32* {{[^,]+}}, [10 x [10 x [10 x i32]]] addrspace(1)* noalias{{[^,]+}}, i32 addrspace(1)* noalias{{[^,]+}}, [10 x [10 x i32]] addrspace(1)* noalias{{[^,]+}}, i8 addrspace(1)* noalias{{[^)]+}}) -// CHECK: addrspacecast [10 x [10 x [10 x i32]]] addrspace(1)* %{{.+}} to [10 x [10 x [10 x i32]]]* -// CHECK: addrspacecast i32 addrspace(1)* %{{.+}} to i32* -// CHECK: addrspacecast [10 x [10 x i32]] addrspace(1)* %{{.+}} to [10 x [10 x i32]]* - -// CHECK: define internal void @[[NONDEBUG_WRAPPER]](i32* {{[^,]+}}, i32* {{[^,]+}}, [10 x [10 x [10 x i32]]]* nonnull align {{[0-9]+}} dereferenceable{{[^,]+}}, i32* nonnull align {{[0-9]+}} dereferenceable{{[^,]+}}, [10 x [10 x i32]]* nonnull align {{[0-9]+}} dereferenceable{{[^,]+}}, i8* nonnull align {{[0-9]+}} dereferenceable{{[^)]+}}) -// CHECK: addrspacecast [10 x [10 x [10 x i32]]]* %{{.+}} to [10 x [10 x [10 x i32]]] addrspace(1)* -// CHECK: addrspacecast i32* %{{.+}} to i32 addrspace(1)* -// CHECK: addrspacecast [10 x [10 x i32]]* %{{.+}} to [10 x [10 x i32]] addrspace(1)* -// CHECK: call void @[[DEBUG_PARALLEL]](i32* {{[^,]+}}, i32* {{[^,]+}}, [10 x [10 x [10 x i32]]] addrspace(1)* {{[^,]+}}, i32 addrspace(1)* {{[^,]+}}, [10 x [10 x i32]] addrspace(1)* {{[^,]+}}, i8 addrspace(1)* {{[^)]+}}) - -// CHECK: define weak void @__omp_offloading_{{[^(]+}}([10 x [10 x [10 x i32]]]* nonnull align {{[0-9]+}} dereferenceable{{[^,]+}}, i32* nonnull align {{[0-9]+}} dereferenceable{{[^,]+}}, [10 x [10 x i32]]* nonnull align {{[0-9]+}} dereferenceable{{[^,]+}}, i8* nonnull align {{[0-9]+}} dereferenceable{{[^)]+}}) -// CHECK: addrspacecast [10 x [10 x [10 x i32]]]* %{{.+}} to [10 x [10 x [10 x i32]]] addrspace(1)* -// CHECK: addrspacecast i32* %{{.+}} to i32 addrspace(1)* -// CHECK: addrspacecast [10 x [10 x i32]]* %{{.+}} to [10 x [10 x i32]] addrspace(1)* -// CHECK: call void @__omp_offloading_{{[^(]+}}([10 x [10 x [10 x i32]]] addrspace(1)* {{[^,]+}}, i32 addrspace(1)* {{[^,]+}}, [10 x [10 x i32]] addrspace(1)* 
{{[^,]+}}, i8 addrspace(1)* {{[^)]+}}) - -// CHECK-DAG: distinct !DISubprogram(name: "[[NONDEBUG_WRAPPER]]", -// CHECK-DAG: distinct !DISubprogram(name: "[[DEBUG_PARALLEL]]", +// CHECK-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l13_debug__ +// CHECK-SAME: ([10 x [10 x [10 x i32]]] addrspace(1)* noalias [[C:%.*]], i32 [[A:%.*]], [10 x [10 x i32]]* noalias [[B:%.*]], i8 addrspace(1)* noalias [[BB:%.*]], i1 zeroext [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR0:[0-9]+]] !dbg [[DBG12:![0-9]+]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x [10 x i32]]] addrspace(1)*, align 8 +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[B_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 8 +// CHECK-NEXT: [[BB_ADDR:%.*]] = alloca i8 addrspace(1)*, align 8 +// CHECK-NEXT: [[DOTCAPTURE_EXPR__ADDR:%.*]] = alloca i8, align 1 +// CHECK-NEXT: [[TMP:%.*]] = alloca [10 x [10 x [10 x i32]]]*, align 8 +// CHECK-NEXT: [[_TMP1:%.*]] = alloca [10 x [10 x i32]]*, align 8 +// CHECK-NEXT: [[_TMP2:%.*]] = alloca i8*, align 8 +// CHECK-NEXT: [[A_CASTED:%.*]] = alloca i64, align 8 +// CHECK-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x i8*], align 8 +// CHECK-NEXT: store [10 x [10 x [10 x i32]]] addrspace(1)* [[C]], [10 x [10 x [10 x i32]]] addrspace(1)** [[C_ADDR]], align 8 +// CHECK-NEXT: call void @llvm.dbg.declare(metadata [10 x [10 x [10 x i32]]] addrspace(1)** [[C_ADDR]], metadata [[META29:![0-9]+]], metadata !DIExpression()), !dbg [[DBG30:![0-9]+]] +// CHECK-NEXT: store i32 [[A]], i32* [[A_ADDR]], align 4 +// CHECK-NEXT: call void @llvm.dbg.declare(metadata i32* [[A_ADDR]], metadata [[META31:![0-9]+]], metadata !DIExpression()), !dbg [[DBG32:![0-9]+]] +// CHECK-NEXT: store [10 x [10 x i32]]* [[B]], [10 x [10 x i32]]** [[B_ADDR]], align 8 +// CHECK-NEXT: call void @llvm.dbg.declare(metadata [10 x [10 x i32]]** [[B_ADDR]], metadata [[META33:![0-9]+]], metadata !DIExpression()), !dbg [[DBG34:![0-9]+]] +// CHECK-NEXT: store i8 addrspace(1)* [[BB]], i8 addrspace(1)** [[BB_ADDR]], align 8 +// CHECK-NEXT: call void @llvm.dbg.declare(metadata i8 addrspace(1)** [[BB_ADDR]], metadata [[META35:![0-9]+]], metadata !DIExpression()), !dbg [[DBG36:![0-9]+]] +// CHECK-NEXT: [[FROMBOOL:%.*]] = zext i1 [[DOTCAPTURE_EXPR_]] to i8 +// CHECK-NEXT: store i8 [[FROMBOOL]], i8* [[DOTCAPTURE_EXPR__ADDR]], align 1 +// CHECK-NEXT: call void @llvm.dbg.declare(metadata i8* [[DOTCAPTURE_EXPR__ADDR]], metadata [[META37:![0-9]+]], metadata !DIExpression()), !dbg [[DBG38:![0-9]+]] +// CHECK-NEXT: [[TMP0:%.*]] = load [10 x [10 x [10 x i32]]] addrspace(1)*, [10 x [10 x [10 x i32]]] addrspace(1)** [[C_ADDR]], align 8, !dbg [[DBG39:![0-9]+]] +// CHECK-NEXT: [[TMP1:%.*]] = addrspacecast [10 x [10 x [10 x i32]]] addrspace(1)* [[TMP0]] to [10 x [10 x [10 x i32]]]*, !dbg [[DBG39]] +// CHECK-NEXT: store [10 x [10 x [10 x i32]]]* [[TMP1]], [10 x [10 x [10 x i32]]]** [[TMP]], align 8, !dbg [[DBG39]] +// CHECK-NEXT: [[TMP2:%.*]] = load [10 x [10 x [10 x i32]]]*, [10 x [10 x [10 x i32]]]** [[TMP]], align 8, !dbg [[DBG39]] +// CHECK-NEXT: [[TMP3:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[B_ADDR]], align 8, !dbg [[DBG39]] +// CHECK-NEXT: store [10 x [10 x i32]]* [[TMP3]], [10 x [10 x i32]]** [[_TMP1]], align 8, !dbg [[DBG39]] +// CHECK-NEXT: [[TMP4:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[_TMP1]], align 8, !dbg [[DBG39]] +// CHECK-NEXT: [[TMP5:%.*]] = load i8 addrspace(1)*, i8 addrspace(1)** [[BB_ADDR]], align 8, !dbg [[DBG39]] +// CHECK-NEXT: [[TMP6:%.*]] = addrspacecast i8 
addrspace(1)* [[TMP5]] to i8*, !dbg [[DBG39]] +// CHECK-NEXT: store i8* [[TMP6]], i8** [[_TMP2]], align 8, !dbg [[DBG39]] +// CHECK-NEXT: [[TMP7:%.*]] = load i8*, i8** [[_TMP2]], align 8, !dbg [[DBG39]] +// CHECK-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !dbg [[DBG39]] +// CHECK-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0), !dbg [[DBG39]] +// CHECK-NEXT: br label [[DOTEXECUTE:%.*]], !dbg [[DBG39]] +// CHECK: .execute: +// CHECK-NEXT: [[TMP8:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB4:[0-9]+]]) +// CHECK-NEXT: [[TMP9:%.*]] = load i32, i32* [[A_ADDR]], align 4, !dbg [[DBG40:![0-9]+]] +// CHECK-NEXT: [[CONV:%.*]] = bitcast i64* [[A_CASTED]] to i32*, !dbg [[DBG40]] +// CHECK-NEXT: store i32 [[TMP9]], i32* [[CONV]], align 4, !dbg [[DBG40]] +// CHECK-NEXT: [[TMP10:%.*]] = load i64, i64* [[A_CASTED]], align 8, !dbg [[DBG40]] +// CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0, !dbg [[DBG40]] +// CHECK-NEXT: [[TMP12:%.*]] = bitcast [10 x [10 x [10 x i32]]]* [[TMP2]] to i8*, !dbg [[DBG40]] +// CHECK-NEXT: store i8* [[TMP12]], i8** [[TMP11]], align 8, !dbg [[DBG40]] +// CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1, !dbg [[DBG40]] +// CHECK-NEXT: [[TMP14:%.*]] = inttoptr i64 [[TMP10]] to i8*, !dbg [[DBG40]] +// CHECK-NEXT: store i8* [[TMP14]], i8** [[TMP13]], align 8, !dbg [[DBG40]] +// CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 2, !dbg [[DBG40]] +// CHECK-NEXT: [[TMP16:%.*]] = bitcast [10 x [10 x i32]]* [[TMP4]] to i8*, !dbg [[DBG40]] +// CHECK-NEXT: store i8* [[TMP16]], i8** [[TMP15]], align 8, !dbg [[DBG40]] +// CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 3, !dbg [[DBG40]] +// CHECK-NEXT: store i8* [[TMP7]], i8** [[TMP17]], align 8, !dbg [[DBG40]] +// CHECK-NEXT: [[TMP18:%.*]] = load i8, i8* [[DOTCAPTURE_EXPR__ADDR]], align 1, !dbg [[DBG41:![0-9]+]] +// CHECK-NEXT: [[TOBOOL:%.*]] = trunc i8 [[TMP18]] to i1, !dbg [[DBG41]] +// CHECK-NEXT: [[TMP19:%.*]] = zext i1 [[TOBOOL]] to i32, !dbg [[DBG40]] +// CHECK-NEXT: [[TMP20:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**, !dbg [[DBG40]] +// CHECK-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB4]], i32 [[TMP8]], i32 [[TMP19]], i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, [10 x [10 x [10 x i32]]]*, i64, [10 x [10 x i32]]*, i8*)* @__omp_outlined__ to i8*), i8* null, i8** [[TMP20]], i64 4), !dbg [[DBG40]] +// CHECK-NEXT: br label [[DOTOMP_DEINIT:%.*]], !dbg [[DBG43:![0-9]+]] +// CHECK: .omp.deinit: +// CHECK-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0), !dbg [[DBG43]] +// CHECK-NEXT: br label [[DOTEXIT:%.*]], !dbg [[DBG43]] +// CHECK: .exit: +// CHECK-NEXT: ret void, !dbg [[DBG44:![0-9]+]] +// +// +// CHECK-LABEL: define {{[^@]+}}@__omp_outlined___debug__ +// CHECK-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x [10 x [10 x i32]]] addrspace(1)* noalias [[C:%.*]], i32 [[A:%.*]], [10 x [10 x i32]]* noalias [[B:%.*]], i8 addrspace(1)* noalias [[BB:%.*]]) #[[ATTR0]] !dbg [[DBG45:![0-9]+]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x [10 x i32]]] addrspace(1)*, align 8 +// CHECK-NEXT: 
[[A_ADDR:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[B_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 8 +// CHECK-NEXT: [[BB_ADDR:%.*]] = alloca i8 addrspace(1)*, align 8 +// CHECK-NEXT: [[TMP:%.*]] = alloca [10 x [10 x [10 x i32]]]*, align 8 +// CHECK-NEXT: [[_TMP1:%.*]] = alloca [10 x [10 x i32]]*, align 8 +// CHECK-NEXT: [[_TMP2:%.*]] = alloca i8*, align 8 +// CHECK-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[_TMP3:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[B4:%.*]] = alloca [10 x [10 x i32]], align 4 +// CHECK-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[F:%.*]] = alloca i32*, align 8 +// CHECK-NEXT: [[G:%.*]] = alloca i32*, align 8 +// CHECK-NEXT: [[H:%.*]] = alloca i32*, align 8 +// CHECK-NEXT: [[D:%.*]] = alloca i32, align 4 +// CHECK-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK-NEXT: call void @llvm.dbg.declare(metadata i32** [[DOTGLOBAL_TID__ADDR]], metadata [[META52:![0-9]+]], metadata !DIExpression()), !dbg [[DBG53:![0-9]+]] +// CHECK-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +// CHECK-NEXT: call void @llvm.dbg.declare(metadata i32** [[DOTBOUND_TID__ADDR]], metadata [[META54:![0-9]+]], metadata !DIExpression()), !dbg [[DBG53]] +// CHECK-NEXT: store [10 x [10 x [10 x i32]]] addrspace(1)* [[C]], [10 x [10 x [10 x i32]]] addrspace(1)** [[C_ADDR]], align 8 +// CHECK-NEXT: call void @llvm.dbg.declare(metadata [10 x [10 x [10 x i32]]] addrspace(1)** [[C_ADDR]], metadata [[META55:![0-9]+]], metadata !DIExpression()), !dbg [[DBG56:![0-9]+]] +// CHECK-NEXT: store i32 [[A]], i32* [[A_ADDR]], align 4 +// CHECK-NEXT: call void @llvm.dbg.declare(metadata i32* [[A_ADDR]], metadata [[META57:![0-9]+]], metadata !DIExpression()), !dbg [[DBG58:![0-9]+]] +// CHECK-NEXT: store [10 x [10 x i32]]* [[B]], [10 x [10 x i32]]** [[B_ADDR]], align 8 +// CHECK-NEXT: call void @llvm.dbg.declare(metadata [10 x [10 x i32]]** [[B_ADDR]], metadata [[META59:![0-9]+]], metadata !DIExpression()), !dbg [[DBG60:![0-9]+]] +// CHECK-NEXT: store i8 addrspace(1)* [[BB]], i8 addrspace(1)** [[BB_ADDR]], align 8 +// CHECK-NEXT: call void @llvm.dbg.declare(metadata i8 addrspace(1)** [[BB_ADDR]], metadata [[META61:![0-9]+]], metadata !DIExpression()), !dbg [[DBG62:![0-9]+]] +// CHECK-NEXT: [[TMP0:%.*]] = load [10 x [10 x [10 x i32]]] addrspace(1)*, [10 x [10 x [10 x i32]]] addrspace(1)** [[C_ADDR]], align 8, !dbg [[DBG63:![0-9]+]] +// CHECK-NEXT: [[TMP1:%.*]] = addrspacecast [10 x [10 x [10 x i32]]] addrspace(1)* [[TMP0]] to [10 x [10 x [10 x i32]]]*, !dbg [[DBG63]] +// CHECK-NEXT: store [10 x [10 x [10 x i32]]]* [[TMP1]], [10 x [10 x [10 x i32]]]** [[TMP]], align 8, !dbg [[DBG63]] +// CHECK-NEXT: [[TMP2:%.*]] = load [10 x [10 x [10 x i32]]]*, [10 x [10 x [10 x i32]]]** [[TMP]], align 8, !dbg [[DBG63]] +// CHECK-NEXT: [[TMP3:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[B_ADDR]], align 8, !dbg [[DBG63]] +// CHECK-NEXT: store [10 x [10 x i32]]* [[TMP3]], [10 x [10 x i32]]** [[_TMP1]], align 8, !dbg [[DBG63]] +// CHECK-NEXT: [[TMP4:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[_TMP1]], align 8, !dbg [[DBG63]] +// CHECK-NEXT: [[TMP5:%.*]] = load i8 addrspace(1)*, i8 addrspace(1)** [[BB_ADDR]], align 8, !dbg [[DBG63]] +// CHECK-NEXT: [[TMP6:%.*]] = addrspacecast i8 
addrspace(1)* [[TMP5]] to i8*, !dbg [[DBG63]] +// CHECK-NEXT: store i8* [[TMP6]], i8** [[_TMP2]], align 8, !dbg [[DBG63]] +// CHECK-NEXT: [[TMP7:%.*]] = load i8*, i8** [[_TMP2]], align 8, !dbg [[DBG63]] +// CHECK-NEXT: call void @llvm.dbg.declare(metadata i32* [[DOTOMP_IV]], metadata [[META64:![0-9]+]], metadata !DIExpression()), !dbg [[DBG53]] +// CHECK-NEXT: call void @llvm.dbg.declare(metadata i32* [[DOTOMP_LB]], metadata [[META65:![0-9]+]], metadata !DIExpression()), !dbg [[DBG53]] +// CHECK-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4, !dbg [[DBG66:![0-9]+]] +// CHECK-NEXT: call void @llvm.dbg.declare(metadata i32* [[DOTOMP_UB]], metadata [[META67:![0-9]+]], metadata !DIExpression()), !dbg [[DBG53]] +// CHECK-NEXT: store i32 9, i32* [[DOTOMP_UB]], align 4, !dbg [[DBG66]] +// CHECK-NEXT: call void @llvm.dbg.declare(metadata i32* [[DOTOMP_STRIDE]], metadata [[META68:![0-9]+]], metadata !DIExpression()), !dbg [[DBG53]] +// CHECK-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4, !dbg [[DBG66]] +// CHECK-NEXT: call void @llvm.dbg.declare(metadata i32* [[DOTOMP_IS_LAST]], metadata [[META69:![0-9]+]], metadata !DIExpression()), !dbg [[DBG53]] +// CHECK-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4, !dbg [[DBG66]] +// CHECK-NEXT: call void @llvm.dbg.declare(metadata [10 x [10 x i32]]* [[B4]], metadata [[META70:![0-9]+]], metadata !DIExpression()), !dbg [[DBG53]] +// CHECK-NEXT: [[TMP8:%.*]] = bitcast [10 x [10 x i32]]* [[B4]] to i8*, !dbg [[DBG63]] +// CHECK-NEXT: [[TMP9:%.*]] = bitcast [10 x [10 x i32]]* [[TMP4]] to i8*, !dbg [[DBG63]] +// CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[TMP8]], i8* align 4 [[TMP9]], i64 400, i1 false), !dbg [[DBG63]] +// CHECK-NEXT: call void @llvm.dbg.declare(metadata i32* [[I]], metadata [[META71:![0-9]+]], metadata !DIExpression()), !dbg [[DBG53]] +// CHECK-NEXT: [[TMP10:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8, !dbg [[DBG63]] +// CHECK-NEXT: [[TMP11:%.*]] = load i32, i32* [[TMP10]], align 4, !dbg [[DBG63]] +// CHECK-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1:[0-9]+]], i32 [[TMP11]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1), !dbg [[DBG72:![0-9]+]] +// CHECK-NEXT: br label [[OMP_DISPATCH_COND:%.*]], !dbg [[DBG63]] +// CHECK: omp.dispatch.cond: +// CHECK-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4, !dbg [[DBG66]] +// CHECK-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP12]], 9, !dbg [[DBG66]] +// CHECK-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]], !dbg [[DBG66]] +// CHECK: cond.true: +// CHECK-NEXT: br label [[COND_END:%.*]], !dbg [[DBG66]] +// CHECK: cond.false: +// CHECK-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4, !dbg [[DBG66]] +// CHECK-NEXT: br label [[COND_END]], !dbg [[DBG66]] +// CHECK: cond.end: +// CHECK-NEXT: [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP13]], [[COND_FALSE]] ], !dbg [[DBG66]] +// CHECK-NEXT: store i32 [[COND]], i32* [[DOTOMP_UB]], align 4, !dbg [[DBG66]] +// CHECK-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4, !dbg [[DBG66]] +// CHECK-NEXT: store i32 [[TMP14]], i32* [[DOTOMP_IV]], align 4, !dbg [[DBG66]] +// CHECK-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !dbg [[DBG66]] +// CHECK-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4, !dbg [[DBG66]] +// CHECK-NEXT: [[CMP5:%.*]] = icmp sle i32 [[TMP15]], [[TMP16]], !dbg [[DBG63]] +// CHECK-NEXT: br i1 [[CMP5]], label [[OMP_DISPATCH_BODY:%.*]], label 
[[OMP_DISPATCH_END:%.*]], !dbg [[DBG63]] +// CHECK: omp.dispatch.body: +// CHECK-NEXT: br label [[OMP_INNER_FOR_COND:%.*]], !dbg [[DBG63]] +// CHECK: omp.inner.for.cond: +// CHECK-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !dbg [[DBG66]] +// CHECK-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4, !dbg [[DBG66]] +// CHECK-NEXT: [[CMP6:%.*]] = icmp sle i32 [[TMP17]], [[TMP18]], !dbg [[DBG63]] +// CHECK-NEXT: br i1 [[CMP6]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]], !dbg [[DBG63]] +// CHECK: omp.inner.for.body: +// CHECK-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !dbg [[DBG66]] +// CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP19]], 1, !dbg [[DBG73:![0-9]+]] +// CHECK-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]], !dbg [[DBG73]] +// CHECK-NEXT: store i32 [[ADD]], i32* [[I]], align 4, !dbg [[DBG73]] +// CHECK-NEXT: call void @llvm.dbg.declare(metadata i32** [[F]], metadata [[META74:![0-9]+]], metadata !DIExpression()), !dbg [[DBG77:![0-9]+]] +// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x [10 x [10 x i32]]], [10 x [10 x [10 x i32]]]* [[TMP2]], i64 0, i64 1, !dbg [[DBG78:![0-9]+]] +// CHECK-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* [[ARRAYIDX]], i64 0, i64 1, !dbg [[DBG78]] +// CHECK-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[ARRAYIDX7]], i64 0, i64 1, !dbg [[DBG78]] +// CHECK-NEXT: store i32* [[ARRAYIDX8]], i32** [[F]], align 8, !dbg [[DBG77]] +// CHECK-NEXT: call void @llvm.dbg.declare(metadata i32** [[G]], metadata [[META79:![0-9]+]], metadata !DIExpression()), !dbg [[DBG80:![0-9]+]] +// CHECK-NEXT: store i32* [[A_ADDR]], i32** [[G]], align 8, !dbg [[DBG80]] +// CHECK-NEXT: call void @llvm.dbg.declare(metadata i32** [[H]], metadata [[META81:![0-9]+]], metadata !DIExpression()), !dbg [[DBG82:![0-9]+]] +// CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* [[B4]], i64 0, i64 1, !dbg [[DBG83:![0-9]+]] +// CHECK-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[ARRAYIDX9]], i64 0, i64 1, !dbg [[DBG83]] +// CHECK-NEXT: store i32* [[ARRAYIDX10]], i32** [[H]], align 8, !dbg [[DBG82]] +// CHECK-NEXT: call void @llvm.dbg.declare(metadata i32* [[D]], metadata [[META84:![0-9]+]], metadata !DIExpression()), !dbg [[DBG85:![0-9]+]] +// CHECK-NEXT: store i32 15, i32* [[D]], align 4, !dbg [[DBG85]] +// CHECK-NEXT: store i32 5, i32* [[A_ADDR]], align 4, !dbg [[DBG86:![0-9]+]] +// CHECK-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* [[B4]], i64 0, i64 0, !dbg [[DBG87:![0-9]+]] +// CHECK-NEXT: [[TMP20:%.*]] = load i32, i32* [[A_ADDR]], align 4, !dbg [[DBG88:![0-9]+]] +// CHECK-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP20]] to i64, !dbg [[DBG87]] +// CHECK-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[ARRAYIDX11]], i64 0, i64 [[IDXPROM]], !dbg [[DBG87]] +// CHECK-NEXT: store i32 10, i32* [[ARRAYIDX12]], align 4, !dbg [[DBG89:![0-9]+]] +// CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds [10 x [10 x [10 x i32]]], [10 x [10 x [10 x i32]]]* [[TMP2]], i64 0, i64 0, !dbg [[DBG90:![0-9]+]] +// CHECK-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* [[ARRAYIDX13]], i64 0, i64 0, !dbg [[DBG90]] +// CHECK-NEXT: [[TMP21:%.*]] = load i32, i32* [[A_ADDR]], align 4, !dbg [[DBG91:![0-9]+]] +// CHECK-NEXT: [[IDXPROM15:%.*]] = sext i32 [[TMP21]] to i64, !dbg [[DBG90]] +// CHECK-NEXT: 
[[ARRAYIDX16:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[ARRAYIDX14]], i64 0, i64 [[IDXPROM15]], !dbg [[DBG90]] +// CHECK-NEXT: store i32 11, i32* [[ARRAYIDX16]], align 4, !dbg [[DBG92:![0-9]+]] +// CHECK-NEXT: [[ARRAYIDX17:%.*]] = getelementptr inbounds [10 x [10 x [10 x i32]]], [10 x [10 x [10 x i32]]]* [[TMP2]], i64 0, i64 0, !dbg [[DBG93:![0-9]+]] +// CHECK-NEXT: [[ARRAYIDX18:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* [[ARRAYIDX17]], i64 0, i64 0, !dbg [[DBG93]] +// CHECK-NEXT: [[TMP22:%.*]] = load i32, i32* [[A_ADDR]], align 4, !dbg [[DBG94:![0-9]+]] +// CHECK-NEXT: [[IDXPROM19:%.*]] = sext i32 [[TMP22]] to i64, !dbg [[DBG93]] +// CHECK-NEXT: [[ARRAYIDX20:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[ARRAYIDX18]], i64 0, i64 [[IDXPROM19]], !dbg [[DBG93]] +// CHECK-NEXT: [[TMP23:%.*]] = load i32, i32* [[ARRAYIDX20]], align 4, !dbg [[DBG93]] +// CHECK-NEXT: [[ARRAYIDX21:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* [[B4]], i64 0, i64 0, !dbg [[DBG95:![0-9]+]] +// CHECK-NEXT: [[TMP24:%.*]] = load i32, i32* [[A_ADDR]], align 4, !dbg [[DBG96:![0-9]+]] +// CHECK-NEXT: [[IDXPROM22:%.*]] = sext i32 [[TMP24]] to i64, !dbg [[DBG95]] +// CHECK-NEXT: [[ARRAYIDX23:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[ARRAYIDX21]], i64 0, i64 [[IDXPROM22]], !dbg [[DBG95]] +// CHECK-NEXT: store i32 [[TMP23]], i32* [[ARRAYIDX23]], align 4, !dbg [[DBG97:![0-9]+]] +// CHECK-NEXT: [[ARRAYIDX24:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* [[B4]], i64 0, i64 0, !dbg [[DBG98:![0-9]+]] +// CHECK-NEXT: [[TMP25:%.*]] = load i32, i32* [[A_ADDR]], align 4, !dbg [[DBG99:![0-9]+]] +// CHECK-NEXT: [[IDXPROM25:%.*]] = sext i32 [[TMP25]] to i64, !dbg [[DBG98]] +// CHECK-NEXT: [[ARRAYIDX26:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[ARRAYIDX24]], i64 0, i64 [[IDXPROM25]], !dbg [[DBG98]] +// CHECK-NEXT: [[TMP26:%.*]] = load i32, i32* [[ARRAYIDX26]], align 4, !dbg [[DBG98]] +// CHECK-NEXT: [[TMP27:%.*]] = load i8, i8* [[TMP7]], align 1, !dbg [[DBG100:![0-9]+]] +// CHECK-NEXT: [[TOBOOL:%.*]] = trunc i8 [[TMP27]] to i1, !dbg [[DBG100]] +// CHECK-NEXT: [[CONV:%.*]] = zext i1 [[TOBOOL]] to i32, !dbg [[DBG100]] +// CHECK-NEXT: [[OR:%.*]] = or i32 [[CONV]], [[TMP26]], !dbg [[DBG100]] +// CHECK-NEXT: [[TOBOOL27:%.*]] = icmp ne i32 [[OR]], 0, !dbg [[DBG100]] +// CHECK-NEXT: [[FROMBOOL:%.*]] = zext i1 [[TOBOOL27]] to i8, !dbg [[DBG100]] +// CHECK-NEXT: store i8 [[FROMBOOL]], i8* [[TMP7]], align 1, !dbg [[DBG100]] +// CHECK-NEXT: br label [[OMP_BODY_CONTINUE:%.*]], !dbg [[DBG101:![0-9]+]] +// CHECK: omp.body.continue: +// CHECK-NEXT: br label [[OMP_INNER_FOR_INC:%.*]], !dbg [[DBG72]] +// CHECK: omp.inner.for.inc: +// CHECK-NEXT: [[TMP28:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !dbg [[DBG66]] +// CHECK-NEXT: [[ADD28:%.*]] = add nsw i32 [[TMP28]], 1, !dbg [[DBG63]] +// CHECK-NEXT: store i32 [[ADD28]], i32* [[DOTOMP_IV]], align 4, !dbg [[DBG63]] +// CHECK-NEXT: br label [[OMP_INNER_FOR_COND]], !dbg [[DBG72]], !llvm.loop [[LOOP102:![0-9]+]] +// CHECK: omp.inner.for.end: +// CHECK-NEXT: br label [[OMP_DISPATCH_INC:%.*]], !dbg [[DBG72]] +// CHECK: omp.dispatch.inc: +// CHECK-NEXT: [[TMP29:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4, !dbg [[DBG66]] +// CHECK-NEXT: [[TMP30:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4, !dbg [[DBG66]] +// CHECK-NEXT: [[ADD29:%.*]] = add nsw i32 [[TMP29]], [[TMP30]], !dbg [[DBG63]] +// CHECK-NEXT: store i32 [[ADD29]], i32* [[DOTOMP_LB]], align 4, !dbg [[DBG63]] +// CHECK-NEXT: 
[[TMP31:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4, !dbg [[DBG66]] +// CHECK-NEXT: [[TMP32:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4, !dbg [[DBG66]] +// CHECK-NEXT: [[ADD30:%.*]] = add nsw i32 [[TMP31]], [[TMP32]], !dbg [[DBG63]] +// CHECK-NEXT: store i32 [[ADD30]], i32* [[DOTOMP_UB]], align 4, !dbg [[DBG63]] +// CHECK-NEXT: br label [[OMP_DISPATCH_COND]], !dbg [[DBG72]], !llvm.loop [[LOOP104:![0-9]+]] +// CHECK: omp.dispatch.end: +// CHECK-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB3:[0-9]+]], i32 [[TMP11]]), !dbg [[DBG103:![0-9]+]] +// CHECK-NEXT: ret void, !dbg [[DBG105:![0-9]+]] +// +// +// CHECK-LABEL: define {{[^@]+}}@__omp_outlined__ +// CHECK-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x [10 x [10 x i32]]]* nonnull align 4 dereferenceable(4000) [[C:%.*]], i64 [[A:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[B:%.*]], i8* nonnull align 1 dereferenceable(1) [[BB:%.*]]) #[[ATTR0]] !dbg [[DBG106:![0-9]+]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x [10 x i32]]]*, align 8 +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca i64, align 8 +// CHECK-NEXT: [[B_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 8 +// CHECK-NEXT: [[BB_ADDR:%.*]] = alloca i8*, align 8 +// CHECK-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK-NEXT: call void @llvm.dbg.declare(metadata i32** [[DOTGLOBAL_TID__ADDR]], metadata [[META113:![0-9]+]], metadata !DIExpression()), !dbg [[DBG114:![0-9]+]] +// CHECK-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +// CHECK-NEXT: call void @llvm.dbg.declare(metadata i32** [[DOTBOUND_TID__ADDR]], metadata [[META115:![0-9]+]], metadata !DIExpression()), !dbg [[DBG114]] +// CHECK-NEXT: store [10 x [10 x [10 x i32]]]* [[C]], [10 x [10 x [10 x i32]]]** [[C_ADDR]], align 8 +// CHECK-NEXT: call void @llvm.dbg.declare(metadata [10 x [10 x [10 x i32]]]** [[C_ADDR]], metadata [[META116:![0-9]+]], metadata !DIExpression()), !dbg [[DBG114]] +// CHECK-NEXT: store i64 [[A]], i64* [[A_ADDR]], align 8 +// CHECK-NEXT: call void @llvm.dbg.declare(metadata i64* [[A_ADDR]], metadata [[META117:![0-9]+]], metadata !DIExpression()), !dbg [[DBG114]] +// CHECK-NEXT: store [10 x [10 x i32]]* [[B]], [10 x [10 x i32]]** [[B_ADDR]], align 8 +// CHECK-NEXT: call void @llvm.dbg.declare(metadata [10 x [10 x i32]]** [[B_ADDR]], metadata [[META118:![0-9]+]], metadata !DIExpression()), !dbg [[DBG114]] +// CHECK-NEXT: store i8* [[BB]], i8** [[BB_ADDR]], align 8 +// CHECK-NEXT: call void @llvm.dbg.declare(metadata i8** [[BB_ADDR]], metadata [[META119:![0-9]+]], metadata !DIExpression()), !dbg [[DBG114]] +// CHECK-NEXT: [[TMP0:%.*]] = load [10 x [10 x [10 x i32]]]*, [10 x [10 x [10 x i32]]]** [[C_ADDR]], align 8, !dbg [[DBG120:![0-9]+]] +// CHECK-NEXT: [[CONV:%.*]] = bitcast i64* [[A_ADDR]] to i32*, !dbg [[DBG120]] +// CHECK-NEXT: [[TMP1:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[B_ADDR]], align 8, !dbg [[DBG120]] +// CHECK-NEXT: [[TMP2:%.*]] = load i8*, i8** [[BB_ADDR]], align 8, !dbg [[DBG120]] +// CHECK-NEXT: [[TMP3:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8, !dbg [[DBG120]] +// CHECK-NEXT: [[TMP4:%.*]] = load i32*, i32** [[DOTBOUND_TID__ADDR]], align 8, !dbg [[DBG120]] +// CHECK-NEXT: [[TMP5:%.*]] = load [10 x [10 x [10 x i32]]]*, [10 x [10 x [10 x i32]]]** [[C_ADDR]], align 8, !dbg 
[[DBG120]] +// CHECK-NEXT: [[TMP6:%.*]] = load i32, i32* [[CONV]], align 8, !dbg [[DBG120]] +// CHECK-NEXT: [[TMP7:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[B_ADDR]], align 8, !dbg [[DBG120]] +// CHECK-NEXT: [[TMP8:%.*]] = load i8*, i8** [[BB_ADDR]], align 8, !dbg [[DBG120]] +// CHECK-NEXT: [[TMP9:%.*]] = addrspacecast [10 x [10 x [10 x i32]]]* [[TMP5]] to [10 x [10 x [10 x i32]]] addrspace(1)*, !dbg [[DBG120]] +// CHECK-NEXT: [[TMP10:%.*]] = addrspacecast i8* [[TMP8]] to i8 addrspace(1)*, !dbg [[DBG120]] +// CHECK-NEXT: call void @__omp_outlined___debug__(i32* [[TMP3]], i32* [[TMP4]], [10 x [10 x [10 x i32]]] addrspace(1)* [[TMP9]], i32 [[TMP6]], [10 x [10 x i32]]* [[TMP7]], i8 addrspace(1)* [[TMP10]]) #[[ATTR4:[0-9]+]], !dbg [[DBG120]] +// CHECK-NEXT: ret void, !dbg [[DBG120]] +// +// +// CHECK-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l13 +// CHECK-SAME: ([10 x [10 x [10 x i32]]]* nonnull align 4 dereferenceable(4000) [[C:%.*]], i64 [[A:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[B:%.*]], i8* nonnull align 1 dereferenceable(1) [[BB:%.*]], i64 [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR0]] !dbg [[DBG121:![0-9]+]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x [10 x i32]]]*, align 8 +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca i64, align 8 +// CHECK-NEXT: [[B_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 8 +// CHECK-NEXT: [[BB_ADDR:%.*]] = alloca i8*, align 8 +// CHECK-NEXT: [[DOTCAPTURE_EXPR__ADDR:%.*]] = alloca i64, align 8 +// CHECK-NEXT: store [10 x [10 x [10 x i32]]]* [[C]], [10 x [10 x [10 x i32]]]** [[C_ADDR]], align 8 +// CHECK-NEXT: call void @llvm.dbg.declare(metadata [10 x [10 x [10 x i32]]]** [[C_ADDR]], metadata [[META124:![0-9]+]], metadata !DIExpression()), !dbg [[DBG125:![0-9]+]] +// CHECK-NEXT: store i64 [[A]], i64* [[A_ADDR]], align 8 +// CHECK-NEXT: call void @llvm.dbg.declare(metadata i64* [[A_ADDR]], metadata [[META126:![0-9]+]], metadata !DIExpression()), !dbg [[DBG125]] +// CHECK-NEXT: store [10 x [10 x i32]]* [[B]], [10 x [10 x i32]]** [[B_ADDR]], align 8 +// CHECK-NEXT: call void @llvm.dbg.declare(metadata [10 x [10 x i32]]** [[B_ADDR]], metadata [[META127:![0-9]+]], metadata !DIExpression()), !dbg [[DBG125]] +// CHECK-NEXT: store i8* [[BB]], i8** [[BB_ADDR]], align 8 +// CHECK-NEXT: call void @llvm.dbg.declare(metadata i8** [[BB_ADDR]], metadata [[META128:![0-9]+]], metadata !DIExpression()), !dbg [[DBG125]] +// CHECK-NEXT: store i64 [[DOTCAPTURE_EXPR_]], i64* [[DOTCAPTURE_EXPR__ADDR]], align 8 +// CHECK-NEXT: call void @llvm.dbg.declare(metadata i64* [[DOTCAPTURE_EXPR__ADDR]], metadata [[META129:![0-9]+]], metadata !DIExpression()), !dbg [[DBG125]] +// CHECK-NEXT: [[TMP0:%.*]] = load [10 x [10 x [10 x i32]]]*, [10 x [10 x [10 x i32]]]** [[C_ADDR]], align 8, !dbg [[DBG130:![0-9]+]] +// CHECK-NEXT: [[CONV:%.*]] = bitcast i64* [[A_ADDR]] to i32*, !dbg [[DBG130]] +// CHECK-NEXT: [[TMP1:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[B_ADDR]], align 8, !dbg [[DBG130]] +// CHECK-NEXT: [[TMP2:%.*]] = load i8*, i8** [[BB_ADDR]], align 8, !dbg [[DBG130]] +// CHECK-NEXT: [[CONV1:%.*]] = bitcast i64* [[DOTCAPTURE_EXPR__ADDR]] to i8*, !dbg [[DBG130]] +// CHECK-NEXT: [[TMP3:%.*]] = load [10 x [10 x [10 x i32]]]*, [10 x [10 x [10 x i32]]]** [[C_ADDR]], align 8, !dbg [[DBG130]] +// CHECK-NEXT: [[TMP4:%.*]] = load i32, i32* [[CONV]], align 8, !dbg [[DBG130]] +// CHECK-NEXT: [[TMP5:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[B_ADDR]], align 8, !dbg [[DBG130]] +// CHECK-NEXT: 
[[TMP6:%.*]] = load i8*, i8** [[BB_ADDR]], align 8, !dbg [[DBG130]] +// CHECK-NEXT: [[TMP7:%.*]] = load i8, i8* [[CONV1]], align 8, !dbg [[DBG130]] +// CHECK-NEXT: [[TOBOOL:%.*]] = trunc i8 [[TMP7]] to i1, !dbg [[DBG130]] +// CHECK-NEXT: [[TMP8:%.*]] = addrspacecast [10 x [10 x [10 x i32]]]* [[TMP3]] to [10 x [10 x [10 x i32]]] addrspace(1)*, !dbg [[DBG130]] +// CHECK-NEXT: [[TMP9:%.*]] = addrspacecast i8* [[TMP6]] to i8 addrspace(1)*, !dbg [[DBG130]] +// CHECK-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l13_debug__([10 x [10 x [10 x i32]]] addrspace(1)* [[TMP8]], i32 [[TMP4]], [10 x [10 x i32]]* [[TMP5]], i8 addrspace(1)* [[TMP9]], i1 [[TOBOOL]]) #[[ATTR4]], !dbg [[DBG130]] +// CHECK-NEXT: ret void, !dbg [[DBG130]] +// +// +// CHECK-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l27_debug__ +// CHECK-SAME: ([10 x [10 x [10 x i32]]] addrspace(1)* noalias [[C:%.*]], i32 [[A:%.*]], [10 x [10 x i32]] addrspace(1)* noalias [[B:%.*]], i8 addrspace(1)* noalias [[BB:%.*]]) #[[ATTR0]] !dbg [[DBG131:![0-9]+]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x [10 x i32]]] addrspace(1)*, align 8 +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[B_ADDR:%.*]] = alloca [10 x [10 x i32]] addrspace(1)*, align 8 +// CHECK-NEXT: [[BB_ADDR:%.*]] = alloca i8 addrspace(1)*, align 8 +// CHECK-NEXT: [[TMP:%.*]] = alloca [10 x [10 x [10 x i32]]]*, align 8 +// CHECK-NEXT: [[_TMP1:%.*]] = alloca [10 x [10 x i32]]*, align 8 +// CHECK-NEXT: [[_TMP2:%.*]] = alloca i8*, align 8 +// CHECK-NEXT: [[A_CASTED:%.*]] = alloca i64, align 8 +// CHECK-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x i8*], align 8 +// CHECK-NEXT: store [10 x [10 x [10 x i32]]] addrspace(1)* [[C]], [10 x [10 x [10 x i32]]] addrspace(1)** [[C_ADDR]], align 8 +// CHECK-NEXT: call void @llvm.dbg.declare(metadata [10 x [10 x [10 x i32]]] addrspace(1)** [[C_ADDR]], metadata [[META136:![0-9]+]], metadata !DIExpression()), !dbg [[DBG137:![0-9]+]] +// CHECK-NEXT: store i32 [[A]], i32* [[A_ADDR]], align 4 +// CHECK-NEXT: call void @llvm.dbg.declare(metadata i32* [[A_ADDR]], metadata [[META138:![0-9]+]], metadata !DIExpression()), !dbg [[DBG139:![0-9]+]] +// CHECK-NEXT: store [10 x [10 x i32]] addrspace(1)* [[B]], [10 x [10 x i32]] addrspace(1)** [[B_ADDR]], align 8 +// CHECK-NEXT: call void @llvm.dbg.declare(metadata [10 x [10 x i32]] addrspace(1)** [[B_ADDR]], metadata [[META140:![0-9]+]], metadata !DIExpression()), !dbg [[DBG141:![0-9]+]] +// CHECK-NEXT: store i8 addrspace(1)* [[BB]], i8 addrspace(1)** [[BB_ADDR]], align 8 +// CHECK-NEXT: call void @llvm.dbg.declare(metadata i8 addrspace(1)** [[BB_ADDR]], metadata [[META142:![0-9]+]], metadata !DIExpression()), !dbg [[DBG143:![0-9]+]] +// CHECK-NEXT: [[TMP0:%.*]] = load [10 x [10 x [10 x i32]]] addrspace(1)*, [10 x [10 x [10 x i32]]] addrspace(1)** [[C_ADDR]], align 8, !dbg [[DBG144:![0-9]+]] +// CHECK-NEXT: [[TMP1:%.*]] = addrspacecast [10 x [10 x [10 x i32]]] addrspace(1)* [[TMP0]] to [10 x [10 x [10 x i32]]]*, !dbg [[DBG144]] +// CHECK-NEXT: store [10 x [10 x [10 x i32]]]* [[TMP1]], [10 x [10 x [10 x i32]]]** [[TMP]], align 8, !dbg [[DBG144]] +// CHECK-NEXT: [[TMP2:%.*]] = load [10 x [10 x [10 x i32]]]*, [10 x [10 x [10 x i32]]]** [[TMP]], align 8, !dbg [[DBG144]] +// CHECK-NEXT: [[TMP3:%.*]] = load [10 x [10 x i32]] addrspace(1)*, [10 x [10 x i32]] addrspace(1)** [[B_ADDR]], align 8, !dbg [[DBG144]] +// CHECK-NEXT: [[TMP4:%.*]] = addrspacecast [10 x [10 x i32]] addrspace(1)* [[TMP3]] to [10 x [10 x i32]]*, !dbg 
[[DBG144]] +// CHECK-NEXT: store [10 x [10 x i32]]* [[TMP4]], [10 x [10 x i32]]** [[_TMP1]], align 8, !dbg [[DBG144]] +// CHECK-NEXT: [[TMP5:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[_TMP1]], align 8, !dbg [[DBG144]] +// CHECK-NEXT: [[TMP6:%.*]] = load i8 addrspace(1)*, i8 addrspace(1)** [[BB_ADDR]], align 8, !dbg [[DBG144]] +// CHECK-NEXT: [[TMP7:%.*]] = addrspacecast i8 addrspace(1)* [[TMP6]] to i8*, !dbg [[DBG144]] +// CHECK-NEXT: store i8* [[TMP7]], i8** [[_TMP2]], align 8, !dbg [[DBG144]] +// CHECK-NEXT: [[TMP8:%.*]] = load i8*, i8** [[_TMP2]], align 8, !dbg [[DBG144]] +// CHECK-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !dbg [[DBG144]] +// CHECK-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0), !dbg [[DBG144]] +// CHECK-NEXT: br label [[DOTEXECUTE:%.*]], !dbg [[DBG144]] +// CHECK: .execute: +// CHECK-NEXT: [[TMP9:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB9:[0-9]+]]) +// CHECK-NEXT: [[TMP10:%.*]] = load i32, i32* [[A_ADDR]], align 4, !dbg [[DBG145:![0-9]+]] +// CHECK-NEXT: [[CONV:%.*]] = bitcast i64* [[A_CASTED]] to i32*, !dbg [[DBG145]] +// CHECK-NEXT: store i32 [[TMP10]], i32* [[CONV]], align 4, !dbg [[DBG145]] +// CHECK-NEXT: [[TMP11:%.*]] = load i64, i64* [[A_CASTED]], align 8, !dbg [[DBG145]] +// CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0, !dbg [[DBG145]] +// CHECK-NEXT: [[TMP13:%.*]] = bitcast [10 x [10 x [10 x i32]]]* [[TMP2]] to i8*, !dbg [[DBG145]] +// CHECK-NEXT: store i8* [[TMP13]], i8** [[TMP12]], align 8, !dbg [[DBG145]] +// CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1, !dbg [[DBG145]] +// CHECK-NEXT: [[TMP15:%.*]] = inttoptr i64 [[TMP11]] to i8*, !dbg [[DBG145]] +// CHECK-NEXT: store i8* [[TMP15]], i8** [[TMP14]], align 8, !dbg [[DBG145]] +// CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 2, !dbg [[DBG145]] +// CHECK-NEXT: [[TMP17:%.*]] = bitcast [10 x [10 x i32]]* [[TMP5]] to i8*, !dbg [[DBG145]] +// CHECK-NEXT: store i8* [[TMP17]], i8** [[TMP16]], align 8, !dbg [[DBG145]] +// CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 3, !dbg [[DBG145]] +// CHECK-NEXT: store i8* [[TMP8]], i8** [[TMP18]], align 8, !dbg [[DBG145]] +// CHECK-NEXT: [[TMP19:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**, !dbg [[DBG145]] +// CHECK-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB9]], i32 [[TMP9]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, [10 x [10 x [10 x i32]]]*, i64, [10 x [10 x i32]]*, i8*)* @__omp_outlined__2 to i8*), i8* null, i8** [[TMP19]], i64 4), !dbg [[DBG145]] +// CHECK-NEXT: br label [[DOTOMP_DEINIT:%.*]], !dbg [[DBG146:![0-9]+]] +// CHECK: .omp.deinit: +// CHECK-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0), !dbg [[DBG146]] +// CHECK-NEXT: br label [[DOTEXIT:%.*]], !dbg [[DBG146]] +// CHECK: .exit: +// CHECK-NEXT: ret void, !dbg [[DBG148:![0-9]+]] +// +// +// CHECK-LABEL: define {{[^@]+}}@__omp_outlined___debug__1 +// CHECK-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x [10 x [10 x i32]]] addrspace(1)* noalias [[C:%.*]], i32 [[A:%.*]], [10 x [10 x i32]] addrspace(1)* noalias [[B:%.*]], i8 addrspace(1)* noalias [[BB:%.*]]) #[[ATTR0]] !dbg [[DBG149:![0-9]+]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 
8 +// CHECK-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x [10 x i32]]] addrspace(1)*, align 8 +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[B_ADDR:%.*]] = alloca [10 x [10 x i32]] addrspace(1)*, align 8 +// CHECK-NEXT: [[BB_ADDR:%.*]] = alloca i8 addrspace(1)*, align 8 +// CHECK-NEXT: [[TMP:%.*]] = alloca [10 x [10 x [10 x i32]]]*, align 8 +// CHECK-NEXT: [[_TMP1:%.*]] = alloca [10 x [10 x i32]]*, align 8 +// CHECK-NEXT: [[_TMP2:%.*]] = alloca i8*, align 8 +// CHECK-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[_TMP3:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[F:%.*]] = alloca i32*, align 8 +// CHECK-NEXT: [[G:%.*]] = alloca i32*, align 8 +// CHECK-NEXT: [[H:%.*]] = alloca i32*, align 8 +// CHECK-NEXT: [[D:%.*]] = alloca i32, align 4 +// CHECK-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK-NEXT: call void @llvm.dbg.declare(metadata i32** [[DOTGLOBAL_TID__ADDR]], metadata [[META152:![0-9]+]], metadata !DIExpression()), !dbg [[DBG153:![0-9]+]] +// CHECK-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +// CHECK-NEXT: call void @llvm.dbg.declare(metadata i32** [[DOTBOUND_TID__ADDR]], metadata [[META154:![0-9]+]], metadata !DIExpression()), !dbg [[DBG153]] +// CHECK-NEXT: store [10 x [10 x [10 x i32]]] addrspace(1)* [[C]], [10 x [10 x [10 x i32]]] addrspace(1)** [[C_ADDR]], align 8 +// CHECK-NEXT: call void @llvm.dbg.declare(metadata [10 x [10 x [10 x i32]]] addrspace(1)** [[C_ADDR]], metadata [[META155:![0-9]+]], metadata !DIExpression()), !dbg [[DBG156:![0-9]+]] +// CHECK-NEXT: store i32 [[A]], i32* [[A_ADDR]], align 4 +// CHECK-NEXT: call void @llvm.dbg.declare(metadata i32* [[A_ADDR]], metadata [[META157:![0-9]+]], metadata !DIExpression()), !dbg [[DBG158:![0-9]+]] +// CHECK-NEXT: store [10 x [10 x i32]] addrspace(1)* [[B]], [10 x [10 x i32]] addrspace(1)** [[B_ADDR]], align 8 +// CHECK-NEXT: call void @llvm.dbg.declare(metadata [10 x [10 x i32]] addrspace(1)** [[B_ADDR]], metadata [[META159:![0-9]+]], metadata !DIExpression()), !dbg [[DBG160:![0-9]+]] +// CHECK-NEXT: store i8 addrspace(1)* [[BB]], i8 addrspace(1)** [[BB_ADDR]], align 8 +// CHECK-NEXT: call void @llvm.dbg.declare(metadata i8 addrspace(1)** [[BB_ADDR]], metadata [[META161:![0-9]+]], metadata !DIExpression()), !dbg [[DBG162:![0-9]+]] +// CHECK-NEXT: [[TMP0:%.*]] = load [10 x [10 x [10 x i32]]] addrspace(1)*, [10 x [10 x [10 x i32]]] addrspace(1)** [[C_ADDR]], align 8, !dbg [[DBG163:![0-9]+]] +// CHECK-NEXT: [[TMP1:%.*]] = addrspacecast [10 x [10 x [10 x i32]]] addrspace(1)* [[TMP0]] to [10 x [10 x [10 x i32]]]*, !dbg [[DBG163]] +// CHECK-NEXT: store [10 x [10 x [10 x i32]]]* [[TMP1]], [10 x [10 x [10 x i32]]]** [[TMP]], align 8, !dbg [[DBG163]] +// CHECK-NEXT: [[TMP2:%.*]] = load [10 x [10 x [10 x i32]]]*, [10 x [10 x [10 x i32]]]** [[TMP]], align 8, !dbg [[DBG163]] +// CHECK-NEXT: [[TMP3:%.*]] = load [10 x [10 x i32]] addrspace(1)*, [10 x [10 x i32]] addrspace(1)** [[B_ADDR]], align 8, !dbg [[DBG163]] +// CHECK-NEXT: [[TMP4:%.*]] = addrspacecast [10 x [10 x i32]] addrspace(1)* [[TMP3]] to [10 x [10 x i32]]*, !dbg [[DBG163]] +// CHECK-NEXT: store [10 x [10 x i32]]* [[TMP4]], [10 x 
[10 x i32]]** [[_TMP1]], align 8, !dbg [[DBG163]] +// CHECK-NEXT: [[TMP5:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[_TMP1]], align 8, !dbg [[DBG163]] +// CHECK-NEXT: [[TMP6:%.*]] = load i8 addrspace(1)*, i8 addrspace(1)** [[BB_ADDR]], align 8, !dbg [[DBG163]] +// CHECK-NEXT: [[TMP7:%.*]] = addrspacecast i8 addrspace(1)* [[TMP6]] to i8*, !dbg [[DBG163]] +// CHECK-NEXT: store i8* [[TMP7]], i8** [[_TMP2]], align 8, !dbg [[DBG163]] +// CHECK-NEXT: [[TMP8:%.*]] = load i8*, i8** [[_TMP2]], align 8, !dbg [[DBG163]] +// CHECK-NEXT: call void @llvm.dbg.declare(metadata i32* [[DOTOMP_IV]], metadata [[META164:![0-9]+]], metadata !DIExpression()), !dbg [[DBG153]] +// CHECK-NEXT: call void @llvm.dbg.declare(metadata i32* [[DOTOMP_LB]], metadata [[META165:![0-9]+]], metadata !DIExpression()), !dbg [[DBG153]] +// CHECK-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4, !dbg [[DBG166:![0-9]+]] +// CHECK-NEXT: call void @llvm.dbg.declare(metadata i32* [[DOTOMP_UB]], metadata [[META167:![0-9]+]], metadata !DIExpression()), !dbg [[DBG153]] +// CHECK-NEXT: store i32 9, i32* [[DOTOMP_UB]], align 4, !dbg [[DBG166]] +// CHECK-NEXT: call void @llvm.dbg.declare(metadata i32* [[DOTOMP_STRIDE]], metadata [[META168:![0-9]+]], metadata !DIExpression()), !dbg [[DBG153]] +// CHECK-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4, !dbg [[DBG166]] +// CHECK-NEXT: call void @llvm.dbg.declare(metadata i32* [[DOTOMP_IS_LAST]], metadata [[META169:![0-9]+]], metadata !DIExpression()), !dbg [[DBG153]] +// CHECK-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4, !dbg [[DBG166]] +// CHECK-NEXT: call void @llvm.dbg.declare(metadata i32* [[I]], metadata [[META170:![0-9]+]], metadata !DIExpression()), !dbg [[DBG153]] +// CHECK-NEXT: [[TMP9:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8, !dbg [[DBG163]] +// CHECK-NEXT: [[TMP10:%.*]] = load i32, i32* [[TMP9]], align 4, !dbg [[DBG163]] +// CHECK-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB6:[0-9]+]], i32 [[TMP10]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1), !dbg [[DBG171:![0-9]+]] +// CHECK-NEXT: br label [[OMP_DISPATCH_COND:%.*]], !dbg [[DBG163]] +// CHECK: omp.dispatch.cond: +// CHECK-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4, !dbg [[DBG166]] +// CHECK-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP11]], 9, !dbg [[DBG166]] +// CHECK-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]], !dbg [[DBG166]] +// CHECK: cond.true: +// CHECK-NEXT: br label [[COND_END:%.*]], !dbg [[DBG166]] +// CHECK: cond.false: +// CHECK-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4, !dbg [[DBG166]] +// CHECK-NEXT: br label [[COND_END]], !dbg [[DBG166]] +// CHECK: cond.end: +// CHECK-NEXT: [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP12]], [[COND_FALSE]] ], !dbg [[DBG166]] +// CHECK-NEXT: store i32 [[COND]], i32* [[DOTOMP_UB]], align 4, !dbg [[DBG166]] +// CHECK-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4, !dbg [[DBG166]] +// CHECK-NEXT: store i32 [[TMP13]], i32* [[DOTOMP_IV]], align 4, !dbg [[DBG166]] +// CHECK-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !dbg [[DBG166]] +// CHECK-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4, !dbg [[DBG166]] +// CHECK-NEXT: [[CMP4:%.*]] = icmp sle i32 [[TMP14]], [[TMP15]], !dbg [[DBG163]] +// CHECK-NEXT: br i1 [[CMP4]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]], !dbg [[DBG163]] +// CHECK: omp.dispatch.body: +// CHECK-NEXT: br label 
[[OMP_INNER_FOR_COND:%.*]], !dbg [[DBG163]] +// CHECK: omp.inner.for.cond: +// CHECK-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !dbg [[DBG166]] +// CHECK-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4, !dbg [[DBG166]] +// CHECK-NEXT: [[CMP5:%.*]] = icmp sle i32 [[TMP16]], [[TMP17]], !dbg [[DBG163]] +// CHECK-NEXT: br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]], !dbg [[DBG163]] +// CHECK: omp.inner.for.body: +// CHECK-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !dbg [[DBG166]] +// CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP18]], 1, !dbg [[DBG172:![0-9]+]] +// CHECK-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]], !dbg [[DBG172]] +// CHECK-NEXT: store i32 [[ADD]], i32* [[I]], align 4, !dbg [[DBG172]] +// CHECK-NEXT: call void @llvm.dbg.declare(metadata i32** [[F]], metadata [[META173:![0-9]+]], metadata !DIExpression()), !dbg [[DBG175:![0-9]+]] +// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x [10 x [10 x i32]]], [10 x [10 x [10 x i32]]]* [[TMP2]], i64 0, i64 1, !dbg [[DBG176:![0-9]+]] +// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* [[ARRAYIDX]], i64 0, i64 1, !dbg [[DBG176]] +// CHECK-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[ARRAYIDX6]], i64 0, i64 1, !dbg [[DBG176]] +// CHECK-NEXT: store i32* [[ARRAYIDX7]], i32** [[F]], align 8, !dbg [[DBG175]] +// CHECK-NEXT: call void @llvm.dbg.declare(metadata i32** [[G]], metadata [[META177:![0-9]+]], metadata !DIExpression()), !dbg [[DBG178:![0-9]+]] +// CHECK-NEXT: store i32* [[A_ADDR]], i32** [[G]], align 8, !dbg [[DBG178]] +// CHECK-NEXT: call void @llvm.dbg.declare(metadata i32** [[H]], metadata [[META179:![0-9]+]], metadata !DIExpression()), !dbg [[DBG180:![0-9]+]] +// CHECK-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* [[TMP5]], i64 0, i64 1, !dbg [[DBG181:![0-9]+]] +// CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[ARRAYIDX8]], i64 0, i64 1, !dbg [[DBG181]] +// CHECK-NEXT: store i32* [[ARRAYIDX9]], i32** [[H]], align 8, !dbg [[DBG180]] +// CHECK-NEXT: call void @llvm.dbg.declare(metadata i32* [[D]], metadata [[META182:![0-9]+]], metadata !DIExpression()), !dbg [[DBG183:![0-9]+]] +// CHECK-NEXT: store i32 15, i32* [[D]], align 4, !dbg [[DBG183]] +// CHECK-NEXT: store i32 5, i32* [[A_ADDR]], align 4, !dbg [[DBG184:![0-9]+]] +// CHECK-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* [[TMP5]], i64 0, i64 0, !dbg [[DBG185:![0-9]+]] +// CHECK-NEXT: [[TMP19:%.*]] = load i32, i32* [[A_ADDR]], align 4, !dbg [[DBG186:![0-9]+]] +// CHECK-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP19]] to i64, !dbg [[DBG185]] +// CHECK-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[ARRAYIDX10]], i64 0, i64 [[IDXPROM]], !dbg [[DBG185]] +// CHECK-NEXT: store i32 10, i32* [[ARRAYIDX11]], align 4, !dbg [[DBG187:![0-9]+]] +// CHECK-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds [10 x [10 x [10 x i32]]], [10 x [10 x [10 x i32]]]* [[TMP2]], i64 0, i64 0, !dbg [[DBG188:![0-9]+]] +// CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* [[ARRAYIDX12]], i64 0, i64 0, !dbg [[DBG188]] +// CHECK-NEXT: [[TMP20:%.*]] = load i32, i32* [[A_ADDR]], align 4, !dbg [[DBG189:![0-9]+]] +// CHECK-NEXT: [[IDXPROM14:%.*]] = sext i32 [[TMP20]] to i64, !dbg [[DBG188]] +// CHECK-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* 
[[ARRAYIDX13]], i64 0, i64 [[IDXPROM14]], !dbg [[DBG188]] +// CHECK-NEXT: store i32 11, i32* [[ARRAYIDX15]], align 4, !dbg [[DBG190:![0-9]+]] +// CHECK-NEXT: [[ARRAYIDX16:%.*]] = getelementptr inbounds [10 x [10 x [10 x i32]]], [10 x [10 x [10 x i32]]]* [[TMP2]], i64 0, i64 0, !dbg [[DBG191:![0-9]+]] +// CHECK-NEXT: [[ARRAYIDX17:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* [[ARRAYIDX16]], i64 0, i64 0, !dbg [[DBG191]] +// CHECK-NEXT: [[TMP21:%.*]] = load i32, i32* [[A_ADDR]], align 4, !dbg [[DBG192:![0-9]+]] +// CHECK-NEXT: [[IDXPROM18:%.*]] = sext i32 [[TMP21]] to i64, !dbg [[DBG191]] +// CHECK-NEXT: [[ARRAYIDX19:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[ARRAYIDX17]], i64 0, i64 [[IDXPROM18]], !dbg [[DBG191]] +// CHECK-NEXT: [[TMP22:%.*]] = load i32, i32* [[ARRAYIDX19]], align 4, !dbg [[DBG191]] +// CHECK-NEXT: [[ARRAYIDX20:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* [[TMP5]], i64 0, i64 0, !dbg [[DBG193:![0-9]+]] +// CHECK-NEXT: [[TMP23:%.*]] = load i32, i32* [[A_ADDR]], align 4, !dbg [[DBG194:![0-9]+]] +// CHECK-NEXT: [[IDXPROM21:%.*]] = sext i32 [[TMP23]] to i64, !dbg [[DBG193]] +// CHECK-NEXT: [[ARRAYIDX22:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[ARRAYIDX20]], i64 0, i64 [[IDXPROM21]], !dbg [[DBG193]] +// CHECK-NEXT: store i32 [[TMP22]], i32* [[ARRAYIDX22]], align 4, !dbg [[DBG195:![0-9]+]] +// CHECK-NEXT: [[TMP24:%.*]] = load i8, i8* [[TMP8]], align 1, !dbg [[DBG196:![0-9]+]] +// CHECK-NEXT: [[TOBOOL:%.*]] = trunc i8 [[TMP24]] to i1, !dbg [[DBG196]] +// CHECK-NEXT: [[CONV:%.*]] = zext i1 [[TOBOOL]] to i32, !dbg [[DBG196]] +// CHECK-NEXT: store i32 [[CONV]], i32* [[D]], align 4, !dbg [[DBG197:![0-9]+]] +// CHECK-NEXT: br label [[OMP_BODY_CONTINUE:%.*]], !dbg [[DBG198:![0-9]+]] +// CHECK: omp.body.continue: +// CHECK-NEXT: br label [[OMP_INNER_FOR_INC:%.*]], !dbg [[DBG171]] +// CHECK: omp.inner.for.inc: +// CHECK-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !dbg [[DBG166]] +// CHECK-NEXT: [[ADD23:%.*]] = add nsw i32 [[TMP25]], 1, !dbg [[DBG163]] +// CHECK-NEXT: store i32 [[ADD23]], i32* [[DOTOMP_IV]], align 4, !dbg [[DBG163]] +// CHECK-NEXT: br label [[OMP_INNER_FOR_COND]], !dbg [[DBG171]], !llvm.loop [[LOOP199:![0-9]+]] +// CHECK: omp.inner.for.end: +// CHECK-NEXT: br label [[OMP_DISPATCH_INC:%.*]], !dbg [[DBG171]] +// CHECK: omp.dispatch.inc: +// CHECK-NEXT: [[TMP26:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4, !dbg [[DBG166]] +// CHECK-NEXT: [[TMP27:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4, !dbg [[DBG166]] +// CHECK-NEXT: [[ADD24:%.*]] = add nsw i32 [[TMP26]], [[TMP27]], !dbg [[DBG163]] +// CHECK-NEXT: store i32 [[ADD24]], i32* [[DOTOMP_LB]], align 4, !dbg [[DBG163]] +// CHECK-NEXT: [[TMP28:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4, !dbg [[DBG166]] +// CHECK-NEXT: [[TMP29:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4, !dbg [[DBG166]] +// CHECK-NEXT: [[ADD25:%.*]] = add nsw i32 [[TMP28]], [[TMP29]], !dbg [[DBG163]] +// CHECK-NEXT: store i32 [[ADD25]], i32* [[DOTOMP_UB]], align 4, !dbg [[DBG163]] +// CHECK-NEXT: br label [[OMP_DISPATCH_COND]], !dbg [[DBG171]], !llvm.loop [[LOOP201:![0-9]+]] +// CHECK: omp.dispatch.end: +// CHECK-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB8:[0-9]+]], i32 [[TMP10]]), !dbg [[DBG200:![0-9]+]] +// CHECK-NEXT: ret void, !dbg [[DBG202:![0-9]+]] +// +// +// CHECK-LABEL: define {{[^@]+}}@__omp_outlined__2 +// CHECK-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x [10 x [10 x i32]]]* nonnull 
align 4 dereferenceable(4000) [[C:%.*]], i64 [[A:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[B:%.*]], i8* nonnull align 1 dereferenceable(1) [[BB:%.*]]) #[[ATTR0]] !dbg [[DBG203:![0-9]+]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x [10 x i32]]]*, align 8 +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca i64, align 8 +// CHECK-NEXT: [[B_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 8 +// CHECK-NEXT: [[BB_ADDR:%.*]] = alloca i8*, align 8 +// CHECK-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK-NEXT: call void @llvm.dbg.declare(metadata i32** [[DOTGLOBAL_TID__ADDR]], metadata [[META204:![0-9]+]], metadata !DIExpression()), !dbg [[DBG205:![0-9]+]] +// CHECK-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +// CHECK-NEXT: call void @llvm.dbg.declare(metadata i32** [[DOTBOUND_TID__ADDR]], metadata [[META206:![0-9]+]], metadata !DIExpression()), !dbg [[DBG205]] +// CHECK-NEXT: store [10 x [10 x [10 x i32]]]* [[C]], [10 x [10 x [10 x i32]]]** [[C_ADDR]], align 8 +// CHECK-NEXT: call void @llvm.dbg.declare(metadata [10 x [10 x [10 x i32]]]** [[C_ADDR]], metadata [[META207:![0-9]+]], metadata !DIExpression()), !dbg [[DBG205]] +// CHECK-NEXT: store i64 [[A]], i64* [[A_ADDR]], align 8 +// CHECK-NEXT: call void @llvm.dbg.declare(metadata i64* [[A_ADDR]], metadata [[META208:![0-9]+]], metadata !DIExpression()), !dbg [[DBG205]] +// CHECK-NEXT: store [10 x [10 x i32]]* [[B]], [10 x [10 x i32]]** [[B_ADDR]], align 8 +// CHECK-NEXT: call void @llvm.dbg.declare(metadata [10 x [10 x i32]]** [[B_ADDR]], metadata [[META209:![0-9]+]], metadata !DIExpression()), !dbg [[DBG205]] +// CHECK-NEXT: store i8* [[BB]], i8** [[BB_ADDR]], align 8 +// CHECK-NEXT: call void @llvm.dbg.declare(metadata i8** [[BB_ADDR]], metadata [[META210:![0-9]+]], metadata !DIExpression()), !dbg [[DBG205]] +// CHECK-NEXT: [[TMP0:%.*]] = load [10 x [10 x [10 x i32]]]*, [10 x [10 x [10 x i32]]]** [[C_ADDR]], align 8, !dbg [[DBG211:![0-9]+]] +// CHECK-NEXT: [[CONV:%.*]] = bitcast i64* [[A_ADDR]] to i32*, !dbg [[DBG211]] +// CHECK-NEXT: [[TMP1:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[B_ADDR]], align 8, !dbg [[DBG211]] +// CHECK-NEXT: [[TMP2:%.*]] = load i8*, i8** [[BB_ADDR]], align 8, !dbg [[DBG211]] +// CHECK-NEXT: [[TMP3:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8, !dbg [[DBG211]] +// CHECK-NEXT: [[TMP4:%.*]] = load i32*, i32** [[DOTBOUND_TID__ADDR]], align 8, !dbg [[DBG211]] +// CHECK-NEXT: [[TMP5:%.*]] = load [10 x [10 x [10 x i32]]]*, [10 x [10 x [10 x i32]]]** [[C_ADDR]], align 8, !dbg [[DBG211]] +// CHECK-NEXT: [[TMP6:%.*]] = load i32, i32* [[CONV]], align 8, !dbg [[DBG211]] +// CHECK-NEXT: [[TMP7:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[B_ADDR]], align 8, !dbg [[DBG211]] +// CHECK-NEXT: [[TMP8:%.*]] = load i8*, i8** [[BB_ADDR]], align 8, !dbg [[DBG211]] +// CHECK-NEXT: [[TMP9:%.*]] = addrspacecast [10 x [10 x [10 x i32]]]* [[TMP5]] to [10 x [10 x [10 x i32]]] addrspace(1)*, !dbg [[DBG211]] +// CHECK-NEXT: [[TMP10:%.*]] = addrspacecast [10 x [10 x i32]]* [[TMP7]] to [10 x [10 x i32]] addrspace(1)*, !dbg [[DBG211]] +// CHECK-NEXT: [[TMP11:%.*]] = addrspacecast i8* [[TMP8]] to i8 addrspace(1)*, !dbg [[DBG211]] +// CHECK-NEXT: call void @__omp_outlined___debug__1(i32* [[TMP3]], i32* [[TMP4]], [10 x [10 x [10 x i32]]] addrspace(1)* [[TMP9]], i32 [[TMP6]], [10 x [10 
x i32]] addrspace(1)* [[TMP10]], i8 addrspace(1)* [[TMP11]]) #[[ATTR4]], !dbg [[DBG211]] +// CHECK-NEXT: ret void, !dbg [[DBG211]] +// +// +// CHECK-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l27 +// CHECK-SAME: ([10 x [10 x [10 x i32]]]* nonnull align 4 dereferenceable(4000) [[C:%.*]], i64 [[A:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[B:%.*]], i8* nonnull align 1 dereferenceable(1) [[BB:%.*]]) #[[ATTR0]] !dbg [[DBG212:![0-9]+]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x [10 x i32]]]*, align 8 +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca i64, align 8 +// CHECK-NEXT: [[B_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 8 +// CHECK-NEXT: [[BB_ADDR:%.*]] = alloca i8*, align 8 +// CHECK-NEXT: store [10 x [10 x [10 x i32]]]* [[C]], [10 x [10 x [10 x i32]]]** [[C_ADDR]], align 8 +// CHECK-NEXT: call void @llvm.dbg.declare(metadata [10 x [10 x [10 x i32]]]** [[C_ADDR]], metadata [[META215:![0-9]+]], metadata !DIExpression()), !dbg [[DBG216:![0-9]+]] +// CHECK-NEXT: store i64 [[A]], i64* [[A_ADDR]], align 8 +// CHECK-NEXT: call void @llvm.dbg.declare(metadata i64* [[A_ADDR]], metadata [[META217:![0-9]+]], metadata !DIExpression()), !dbg [[DBG216]] +// CHECK-NEXT: store [10 x [10 x i32]]* [[B]], [10 x [10 x i32]]** [[B_ADDR]], align 8 +// CHECK-NEXT: call void @llvm.dbg.declare(metadata [10 x [10 x i32]]** [[B_ADDR]], metadata [[META218:![0-9]+]], metadata !DIExpression()), !dbg [[DBG216]] +// CHECK-NEXT: store i8* [[BB]], i8** [[BB_ADDR]], align 8 +// CHECK-NEXT: call void @llvm.dbg.declare(metadata i8** [[BB_ADDR]], metadata [[META219:![0-9]+]], metadata !DIExpression()), !dbg [[DBG216]] +// CHECK-NEXT: [[TMP0:%.*]] = load [10 x [10 x [10 x i32]]]*, [10 x [10 x [10 x i32]]]** [[C_ADDR]], align 8, !dbg [[DBG220:![0-9]+]] +// CHECK-NEXT: [[CONV:%.*]] = bitcast i64* [[A_ADDR]] to i32*, !dbg [[DBG220]] +// CHECK-NEXT: [[TMP1:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[B_ADDR]], align 8, !dbg [[DBG220]] +// CHECK-NEXT: [[TMP2:%.*]] = load i8*, i8** [[BB_ADDR]], align 8, !dbg [[DBG220]] +// CHECK-NEXT: [[TMP3:%.*]] = load [10 x [10 x [10 x i32]]]*, [10 x [10 x [10 x i32]]]** [[C_ADDR]], align 8, !dbg [[DBG220]] +// CHECK-NEXT: [[TMP4:%.*]] = load i32, i32* [[CONV]], align 8, !dbg [[DBG220]] +// CHECK-NEXT: [[TMP5:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[B_ADDR]], align 8, !dbg [[DBG220]] +// CHECK-NEXT: [[TMP6:%.*]] = load i8*, i8** [[BB_ADDR]], align 8, !dbg [[DBG220]] +// CHECK-NEXT: [[TMP7:%.*]] = addrspacecast [10 x [10 x [10 x i32]]]* [[TMP3]] to [10 x [10 x [10 x i32]]] addrspace(1)*, !dbg [[DBG220]] +// CHECK-NEXT: [[TMP8:%.*]] = addrspacecast [10 x [10 x i32]]* [[TMP5]] to [10 x [10 x i32]] addrspace(1)*, !dbg [[DBG220]] +// CHECK-NEXT: [[TMP9:%.*]] = addrspacecast i8* [[TMP6]] to i8 addrspace(1)*, !dbg [[DBG220]] +// CHECK-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l27_debug__([10 x [10 x [10 x i32]]] addrspace(1)* [[TMP7]], i32 [[TMP4]], [10 x [10 x i32]] addrspace(1)* [[TMP8]], i8 addrspace(1)* [[TMP9]]) #[[ATTR4]], !dbg [[DBG220]] +// CHECK-NEXT: ret void, !dbg [[DBG220]] +// +// +// CHECK-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l41_debug__ +// CHECK-SAME: ([10 x [10 x [10 x i32]]] addrspace(1)* noalias [[C:%.*]], i32 addrspace(1)* noalias [[A:%.*]], [10 x [10 x i32]] addrspace(1)* noalias [[B:%.*]], i8 addrspace(1)* noalias [[BB:%.*]]) #[[ATTR0]] !dbg [[DBG221:![0-9]+]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[C_ADDR:%.*]] = alloca [10 x 
[10 x [10 x i32]]] addrspace(1)*, align 8 +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca i32 addrspace(1)*, align 8 +// CHECK-NEXT: [[B_ADDR:%.*]] = alloca [10 x [10 x i32]] addrspace(1)*, align 8 +// CHECK-NEXT: [[BB_ADDR:%.*]] = alloca i8 addrspace(1)*, align 8 +// CHECK-NEXT: [[TMP:%.*]] = alloca [10 x [10 x [10 x i32]]]*, align 8 +// CHECK-NEXT: [[_TMP1:%.*]] = alloca i32*, align 8 +// CHECK-NEXT: [[_TMP2:%.*]] = alloca [10 x [10 x i32]]*, align 8 +// CHECK-NEXT: [[_TMP3:%.*]] = alloca i8*, align 8 +// CHECK-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x i8*], align 8 +// CHECK-NEXT: store [10 x [10 x [10 x i32]]] addrspace(1)* [[C]], [10 x [10 x [10 x i32]]] addrspace(1)** [[C_ADDR]], align 8 +// CHECK-NEXT: call void @llvm.dbg.declare(metadata [10 x [10 x [10 x i32]]] addrspace(1)** [[C_ADDR]], metadata [[META226:![0-9]+]], metadata !DIExpression()), !dbg [[DBG227:![0-9]+]] +// CHECK-NEXT: store i32 addrspace(1)* [[A]], i32 addrspace(1)** [[A_ADDR]], align 8 +// CHECK-NEXT: call void @llvm.dbg.declare(metadata i32 addrspace(1)** [[A_ADDR]], metadata [[META228:![0-9]+]], metadata !DIExpression()), !dbg [[DBG229:![0-9]+]] +// CHECK-NEXT: store [10 x [10 x i32]] addrspace(1)* [[B]], [10 x [10 x i32]] addrspace(1)** [[B_ADDR]], align 8 +// CHECK-NEXT: call void @llvm.dbg.declare(metadata [10 x [10 x i32]] addrspace(1)** [[B_ADDR]], metadata [[META230:![0-9]+]], metadata !DIExpression()), !dbg [[DBG231:![0-9]+]] +// CHECK-NEXT: store i8 addrspace(1)* [[BB]], i8 addrspace(1)** [[BB_ADDR]], align 8 +// CHECK-NEXT: call void @llvm.dbg.declare(metadata i8 addrspace(1)** [[BB_ADDR]], metadata [[META232:![0-9]+]], metadata !DIExpression()), !dbg [[DBG233:![0-9]+]] +// CHECK-NEXT: [[TMP0:%.*]] = load [10 x [10 x [10 x i32]]] addrspace(1)*, [10 x [10 x [10 x i32]]] addrspace(1)** [[C_ADDR]], align 8, !dbg [[DBG234:![0-9]+]] +// CHECK-NEXT: [[TMP1:%.*]] = addrspacecast [10 x [10 x [10 x i32]]] addrspace(1)* [[TMP0]] to [10 x [10 x [10 x i32]]]*, !dbg [[DBG234]] +// CHECK-NEXT: store [10 x [10 x [10 x i32]]]* [[TMP1]], [10 x [10 x [10 x i32]]]** [[TMP]], align 8, !dbg [[DBG234]] +// CHECK-NEXT: [[TMP2:%.*]] = load [10 x [10 x [10 x i32]]]*, [10 x [10 x [10 x i32]]]** [[TMP]], align 8, !dbg [[DBG234]] +// CHECK-NEXT: [[TMP3:%.*]] = load i32 addrspace(1)*, i32 addrspace(1)** [[A_ADDR]], align 8, !dbg [[DBG234]] +// CHECK-NEXT: [[TMP4:%.*]] = addrspacecast i32 addrspace(1)* [[TMP3]] to i32*, !dbg [[DBG234]] +// CHECK-NEXT: store i32* [[TMP4]], i32** [[_TMP1]], align 8, !dbg [[DBG234]] +// CHECK-NEXT: [[TMP5:%.*]] = load i32*, i32** [[_TMP1]], align 8, !dbg [[DBG234]] +// CHECK-NEXT: [[TMP6:%.*]] = load [10 x [10 x i32]] addrspace(1)*, [10 x [10 x i32]] addrspace(1)** [[B_ADDR]], align 8, !dbg [[DBG234]] +// CHECK-NEXT: [[TMP7:%.*]] = addrspacecast [10 x [10 x i32]] addrspace(1)* [[TMP6]] to [10 x [10 x i32]]*, !dbg [[DBG234]] +// CHECK-NEXT: store [10 x [10 x i32]]* [[TMP7]], [10 x [10 x i32]]** [[_TMP2]], align 8, !dbg [[DBG234]] +// CHECK-NEXT: [[TMP8:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[_TMP2]], align 8, !dbg [[DBG234]] +// CHECK-NEXT: [[TMP9:%.*]] = load i8 addrspace(1)*, i8 addrspace(1)** [[BB_ADDR]], align 8, !dbg [[DBG234]] +// CHECK-NEXT: [[TMP10:%.*]] = addrspacecast i8 addrspace(1)* [[TMP9]] to i8*, !dbg [[DBG234]] +// CHECK-NEXT: store i8* [[TMP10]], i8** [[_TMP3]], align 8, !dbg [[DBG234]] +// CHECK-NEXT: [[TMP11:%.*]] = load i8*, i8** [[_TMP3]], align 8, !dbg [[DBG234]] +// CHECK-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !dbg [[DBG234]] +// 
CHECK-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0), !dbg [[DBG234]] +// CHECK-NEXT: br label [[DOTEXECUTE:%.*]], !dbg [[DBG234]] +// CHECK: .execute: +// CHECK-NEXT: [[TMP12:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB14:[0-9]+]]) +// CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0, !dbg [[DBG235:![0-9]+]] +// CHECK-NEXT: [[TMP14:%.*]] = bitcast [10 x [10 x [10 x i32]]]* [[TMP2]] to i8*, !dbg [[DBG235]] +// CHECK-NEXT: store i8* [[TMP14]], i8** [[TMP13]], align 8, !dbg [[DBG235]] +// CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1, !dbg [[DBG235]] +// CHECK-NEXT: [[TMP16:%.*]] = bitcast i32* [[TMP5]] to i8*, !dbg [[DBG235]] +// CHECK-NEXT: store i8* [[TMP16]], i8** [[TMP15]], align 8, !dbg [[DBG235]] +// CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 2, !dbg [[DBG235]] +// CHECK-NEXT: [[TMP18:%.*]] = bitcast [10 x [10 x i32]]* [[TMP8]] to i8*, !dbg [[DBG235]] +// CHECK-NEXT: store i8* [[TMP18]], i8** [[TMP17]], align 8, !dbg [[DBG235]] +// CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 3, !dbg [[DBG235]] +// CHECK-NEXT: store i8* [[TMP11]], i8** [[TMP19]], align 8, !dbg [[DBG235]] +// CHECK-NEXT: [[TMP20:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**, !dbg [[DBG235]] +// CHECK-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB14]], i32 [[TMP12]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, [10 x [10 x [10 x i32]]]*, i32*, [10 x [10 x i32]]*, i8*)* @__omp_outlined__4 to i8*), i8* null, i8** [[TMP20]], i64 4), !dbg [[DBG235]] +// CHECK-NEXT: br label [[DOTOMP_DEINIT:%.*]], !dbg [[DBG236:![0-9]+]] +// CHECK: .omp.deinit: +// CHECK-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0), !dbg [[DBG236]] +// CHECK-NEXT: br label [[DOTEXIT:%.*]], !dbg [[DBG236]] +// CHECK: .exit: +// CHECK-NEXT: ret void, !dbg [[DBG238:![0-9]+]] +// +// +// CHECK-LABEL: define {{[^@]+}}@__omp_outlined___debug__3 +// CHECK-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x [10 x [10 x i32]]] addrspace(1)* noalias [[C:%.*]], i32 addrspace(1)* noalias [[A:%.*]], [10 x [10 x i32]] addrspace(1)* noalias [[B:%.*]], i8 addrspace(1)* noalias [[BB:%.*]]) #[[ATTR0]] !dbg [[DBG239:![0-9]+]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x [10 x i32]]] addrspace(1)*, align 8 +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca i32 addrspace(1)*, align 8 +// CHECK-NEXT: [[B_ADDR:%.*]] = alloca [10 x [10 x i32]] addrspace(1)*, align 8 +// CHECK-NEXT: [[BB_ADDR:%.*]] = alloca i8 addrspace(1)*, align 8 +// CHECK-NEXT: [[TMP:%.*]] = alloca [10 x [10 x [10 x i32]]]*, align 8 +// CHECK-NEXT: [[_TMP1:%.*]] = alloca i32*, align 8 +// CHECK-NEXT: [[_TMP2:%.*]] = alloca [10 x [10 x i32]]*, align 8 +// CHECK-NEXT: [[_TMP3:%.*]] = alloca i8*, align 8 +// CHECK-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[_TMP4:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[I:%.*]] = alloca i32, align 4 
+// CHECK-NEXT: [[F:%.*]] = alloca i32*, align 8 +// CHECK-NEXT: [[G:%.*]] = alloca i32*, align 8 +// CHECK-NEXT: [[H:%.*]] = alloca i32*, align 8 +// CHECK-NEXT: [[D:%.*]] = alloca i32, align 4 +// CHECK-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK-NEXT: call void @llvm.dbg.declare(metadata i32** [[DOTGLOBAL_TID__ADDR]], metadata [[META242:![0-9]+]], metadata !DIExpression()), !dbg [[DBG243:![0-9]+]] +// CHECK-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +// CHECK-NEXT: call void @llvm.dbg.declare(metadata i32** [[DOTBOUND_TID__ADDR]], metadata [[META244:![0-9]+]], metadata !DIExpression()), !dbg [[DBG243]] +// CHECK-NEXT: store [10 x [10 x [10 x i32]]] addrspace(1)* [[C]], [10 x [10 x [10 x i32]]] addrspace(1)** [[C_ADDR]], align 8 +// CHECK-NEXT: call void @llvm.dbg.declare(metadata [10 x [10 x [10 x i32]]] addrspace(1)** [[C_ADDR]], metadata [[META245:![0-9]+]], metadata !DIExpression()), !dbg [[DBG246:![0-9]+]] +// CHECK-NEXT: store i32 addrspace(1)* [[A]], i32 addrspace(1)** [[A_ADDR]], align 8 +// CHECK-NEXT: call void @llvm.dbg.declare(metadata i32 addrspace(1)** [[A_ADDR]], metadata [[META247:![0-9]+]], metadata !DIExpression()), !dbg [[DBG248:![0-9]+]] +// CHECK-NEXT: store [10 x [10 x i32]] addrspace(1)* [[B]], [10 x [10 x i32]] addrspace(1)** [[B_ADDR]], align 8 +// CHECK-NEXT: call void @llvm.dbg.declare(metadata [10 x [10 x i32]] addrspace(1)** [[B_ADDR]], metadata [[META249:![0-9]+]], metadata !DIExpression()), !dbg [[DBG250:![0-9]+]] +// CHECK-NEXT: store i8 addrspace(1)* [[BB]], i8 addrspace(1)** [[BB_ADDR]], align 8 +// CHECK-NEXT: call void @llvm.dbg.declare(metadata i8 addrspace(1)** [[BB_ADDR]], metadata [[META251:![0-9]+]], metadata !DIExpression()), !dbg [[DBG252:![0-9]+]] +// CHECK-NEXT: [[TMP0:%.*]] = load [10 x [10 x [10 x i32]]] addrspace(1)*, [10 x [10 x [10 x i32]]] addrspace(1)** [[C_ADDR]], align 8, !dbg [[DBG253:![0-9]+]] +// CHECK-NEXT: [[TMP1:%.*]] = addrspacecast [10 x [10 x [10 x i32]]] addrspace(1)* [[TMP0]] to [10 x [10 x [10 x i32]]]*, !dbg [[DBG253]] +// CHECK-NEXT: store [10 x [10 x [10 x i32]]]* [[TMP1]], [10 x [10 x [10 x i32]]]** [[TMP]], align 8, !dbg [[DBG253]] +// CHECK-NEXT: [[TMP2:%.*]] = load [10 x [10 x [10 x i32]]]*, [10 x [10 x [10 x i32]]]** [[TMP]], align 8, !dbg [[DBG253]] +// CHECK-NEXT: [[TMP3:%.*]] = load i32 addrspace(1)*, i32 addrspace(1)** [[A_ADDR]], align 8, !dbg [[DBG253]] +// CHECK-NEXT: [[TMP4:%.*]] = addrspacecast i32 addrspace(1)* [[TMP3]] to i32*, !dbg [[DBG253]] +// CHECK-NEXT: store i32* [[TMP4]], i32** [[_TMP1]], align 8, !dbg [[DBG253]] +// CHECK-NEXT: [[TMP5:%.*]] = load i32*, i32** [[_TMP1]], align 8, !dbg [[DBG253]] +// CHECK-NEXT: [[TMP6:%.*]] = load [10 x [10 x i32]] addrspace(1)*, [10 x [10 x i32]] addrspace(1)** [[B_ADDR]], align 8, !dbg [[DBG253]] +// CHECK-NEXT: [[TMP7:%.*]] = addrspacecast [10 x [10 x i32]] addrspace(1)* [[TMP6]] to [10 x [10 x i32]]*, !dbg [[DBG253]] +// CHECK-NEXT: store [10 x [10 x i32]]* [[TMP7]], [10 x [10 x i32]]** [[_TMP2]], align 8, !dbg [[DBG253]] +// CHECK-NEXT: [[TMP8:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[_TMP2]], align 8, !dbg [[DBG253]] +// CHECK-NEXT: [[TMP9:%.*]] = load i8 addrspace(1)*, i8 addrspace(1)** [[BB_ADDR]], align 8, !dbg [[DBG253]] +// CHECK-NEXT: [[TMP10:%.*]] = addrspacecast i8 addrspace(1)* [[TMP9]] to i8*, !dbg [[DBG253]] +// CHECK-NEXT: store i8* [[TMP10]], i8** [[_TMP3]], align 8, !dbg [[DBG253]] +// CHECK-NEXT: [[TMP11:%.*]] = load i8*, i8** [[_TMP3]], align 8, !dbg 
[[DBG253]] +// CHECK-NEXT: call void @llvm.dbg.declare(metadata i32* [[DOTOMP_IV]], metadata [[META254:![0-9]+]], metadata !DIExpression()), !dbg [[DBG243]] +// CHECK-NEXT: call void @llvm.dbg.declare(metadata i32* [[DOTOMP_LB]], metadata [[META255:![0-9]+]], metadata !DIExpression()), !dbg [[DBG243]] +// CHECK-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4, !dbg [[DBG256:![0-9]+]] +// CHECK-NEXT: call void @llvm.dbg.declare(metadata i32* [[DOTOMP_UB]], metadata [[META257:![0-9]+]], metadata !DIExpression()), !dbg [[DBG243]] +// CHECK-NEXT: store i32 9, i32* [[DOTOMP_UB]], align 4, !dbg [[DBG256]] +// CHECK-NEXT: call void @llvm.dbg.declare(metadata i32* [[DOTOMP_STRIDE]], metadata [[META258:![0-9]+]], metadata !DIExpression()), !dbg [[DBG243]] +// CHECK-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4, !dbg [[DBG256]] +// CHECK-NEXT: call void @llvm.dbg.declare(metadata i32* [[DOTOMP_IS_LAST]], metadata [[META259:![0-9]+]], metadata !DIExpression()), !dbg [[DBG243]] +// CHECK-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4, !dbg [[DBG256]] +// CHECK-NEXT: call void @llvm.dbg.declare(metadata i32* [[I]], metadata [[META260:![0-9]+]], metadata !DIExpression()), !dbg [[DBG243]] +// CHECK-NEXT: [[TMP12:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8, !dbg [[DBG253]] +// CHECK-NEXT: [[TMP13:%.*]] = load i32, i32* [[TMP12]], align 4, !dbg [[DBG253]] +// CHECK-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB11:[0-9]+]], i32 [[TMP13]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1), !dbg [[DBG261:![0-9]+]] +// CHECK-NEXT: br label [[OMP_DISPATCH_COND:%.*]], !dbg [[DBG253]] +// CHECK: omp.dispatch.cond: +// CHECK-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4, !dbg [[DBG256]] +// CHECK-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP14]], 9, !dbg [[DBG256]] +// CHECK-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]], !dbg [[DBG256]] +// CHECK: cond.true: +// CHECK-NEXT: br label [[COND_END:%.*]], !dbg [[DBG256]] +// CHECK: cond.false: +// CHECK-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4, !dbg [[DBG256]] +// CHECK-NEXT: br label [[COND_END]], !dbg [[DBG256]] +// CHECK: cond.end: +// CHECK-NEXT: [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP15]], [[COND_FALSE]] ], !dbg [[DBG256]] +// CHECK-NEXT: store i32 [[COND]], i32* [[DOTOMP_UB]], align 4, !dbg [[DBG256]] +// CHECK-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4, !dbg [[DBG256]] +// CHECK-NEXT: store i32 [[TMP16]], i32* [[DOTOMP_IV]], align 4, !dbg [[DBG256]] +// CHECK-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !dbg [[DBG256]] +// CHECK-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4, !dbg [[DBG256]] +// CHECK-NEXT: [[CMP5:%.*]] = icmp sle i32 [[TMP17]], [[TMP18]], !dbg [[DBG253]] +// CHECK-NEXT: br i1 [[CMP5]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]], !dbg [[DBG253]] +// CHECK: omp.dispatch.body: +// CHECK-NEXT: br label [[OMP_INNER_FOR_COND:%.*]], !dbg [[DBG253]] +// CHECK: omp.inner.for.cond: +// CHECK-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !dbg [[DBG256]] +// CHECK-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4, !dbg [[DBG256]] +// CHECK-NEXT: [[CMP6:%.*]] = icmp sle i32 [[TMP19]], [[TMP20]], !dbg [[DBG253]] +// CHECK-NEXT: br i1 [[CMP6]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]], !dbg [[DBG253]] +// CHECK: omp.inner.for.body: +// CHECK-NEXT: [[TMP21:%.*]] = load i32, i32* 
[[DOTOMP_IV]], align 4, !dbg [[DBG256]] +// CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP21]], 1, !dbg [[DBG262:![0-9]+]] +// CHECK-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]], !dbg [[DBG262]] +// CHECK-NEXT: store i32 [[ADD]], i32* [[I]], align 4, !dbg [[DBG262]] +// CHECK-NEXT: call void @llvm.dbg.declare(metadata i32** [[F]], metadata [[META263:![0-9]+]], metadata !DIExpression()), !dbg [[DBG265:![0-9]+]] +// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x [10 x [10 x i32]]], [10 x [10 x [10 x i32]]]* [[TMP2]], i64 0, i64 1, !dbg [[DBG266:![0-9]+]] +// CHECK-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* [[ARRAYIDX]], i64 0, i64 1, !dbg [[DBG266]] +// CHECK-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[ARRAYIDX7]], i64 0, i64 1, !dbg [[DBG266]] +// CHECK-NEXT: store i32* [[ARRAYIDX8]], i32** [[F]], align 8, !dbg [[DBG265]] +// CHECK-NEXT: call void @llvm.dbg.declare(metadata i32** [[G]], metadata [[META267:![0-9]+]], metadata !DIExpression()), !dbg [[DBG268:![0-9]+]] +// CHECK-NEXT: store i32* [[TMP5]], i32** [[G]], align 8, !dbg [[DBG268]] +// CHECK-NEXT: call void @llvm.dbg.declare(metadata i32** [[H]], metadata [[META269:![0-9]+]], metadata !DIExpression()), !dbg [[DBG270:![0-9]+]] +// CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* [[TMP8]], i64 0, i64 1, !dbg [[DBG271:![0-9]+]] +// CHECK-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[ARRAYIDX9]], i64 0, i64 1, !dbg [[DBG271]] +// CHECK-NEXT: store i32* [[ARRAYIDX10]], i32** [[H]], align 8, !dbg [[DBG270]] +// CHECK-NEXT: call void @llvm.dbg.declare(metadata i32* [[D]], metadata [[META272:![0-9]+]], metadata !DIExpression()), !dbg [[DBG273:![0-9]+]] +// CHECK-NEXT: store i32 15, i32* [[D]], align 4, !dbg [[DBG273]] +// CHECK-NEXT: store i32 5, i32* [[TMP5]], align 4, !dbg [[DBG274:![0-9]+]] +// CHECK-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* [[TMP8]], i64 0, i64 0, !dbg [[DBG275:![0-9]+]] +// CHECK-NEXT: [[TMP22:%.*]] = load i32, i32* [[TMP5]], align 4, !dbg [[DBG276:![0-9]+]] +// CHECK-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP22]] to i64, !dbg [[DBG275]] +// CHECK-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[ARRAYIDX11]], i64 0, i64 [[IDXPROM]], !dbg [[DBG275]] +// CHECK-NEXT: store i32 10, i32* [[ARRAYIDX12]], align 4, !dbg [[DBG277:![0-9]+]] +// CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds [10 x [10 x [10 x i32]]], [10 x [10 x [10 x i32]]]* [[TMP2]], i64 0, i64 0, !dbg [[DBG278:![0-9]+]] +// CHECK-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* [[ARRAYIDX13]], i64 0, i64 0, !dbg [[DBG278]] +// CHECK-NEXT: [[TMP23:%.*]] = load i32, i32* [[TMP5]], align 4, !dbg [[DBG279:![0-9]+]] +// CHECK-NEXT: [[IDXPROM15:%.*]] = sext i32 [[TMP23]] to i64, !dbg [[DBG278]] +// CHECK-NEXT: [[ARRAYIDX16:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[ARRAYIDX14]], i64 0, i64 [[IDXPROM15]], !dbg [[DBG278]] +// CHECK-NEXT: store i32 11, i32* [[ARRAYIDX16]], align 4, !dbg [[DBG280:![0-9]+]] +// CHECK-NEXT: [[ARRAYIDX17:%.*]] = getelementptr inbounds [10 x [10 x [10 x i32]]], [10 x [10 x [10 x i32]]]* [[TMP2]], i64 0, i64 0, !dbg [[DBG281:![0-9]+]] +// CHECK-NEXT: [[ARRAYIDX18:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* [[ARRAYIDX17]], i64 0, i64 0, !dbg [[DBG281]] +// CHECK-NEXT: [[TMP24:%.*]] = load i32, i32* [[TMP5]], align 4, !dbg 
[[DBG282:![0-9]+]] +// CHECK-NEXT: [[IDXPROM19:%.*]] = sext i32 [[TMP24]] to i64, !dbg [[DBG281]] +// CHECK-NEXT: [[ARRAYIDX20:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[ARRAYIDX18]], i64 0, i64 [[IDXPROM19]], !dbg [[DBG281]] +// CHECK-NEXT: [[TMP25:%.*]] = load i32, i32* [[ARRAYIDX20]], align 4, !dbg [[DBG281]] +// CHECK-NEXT: [[ARRAYIDX21:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* [[TMP8]], i64 0, i64 0, !dbg [[DBG283:![0-9]+]] +// CHECK-NEXT: [[TMP26:%.*]] = load i32, i32* [[TMP5]], align 4, !dbg [[DBG284:![0-9]+]] +// CHECK-NEXT: [[IDXPROM22:%.*]] = sext i32 [[TMP26]] to i64, !dbg [[DBG283]] +// CHECK-NEXT: [[ARRAYIDX23:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[ARRAYIDX21]], i64 0, i64 [[IDXPROM22]], !dbg [[DBG283]] +// CHECK-NEXT: store i32 [[TMP25]], i32* [[ARRAYIDX23]], align 4, !dbg [[DBG285:![0-9]+]] +// CHECK-NEXT: [[ARRAYIDX24:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* [[TMP8]], i64 0, i64 0, !dbg [[DBG286:![0-9]+]] +// CHECK-NEXT: [[TMP27:%.*]] = load i32, i32* [[TMP5]], align 4, !dbg [[DBG287:![0-9]+]] +// CHECK-NEXT: [[IDXPROM25:%.*]] = sext i32 [[TMP27]] to i64, !dbg [[DBG286]] +// CHECK-NEXT: [[ARRAYIDX26:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[ARRAYIDX24]], i64 0, i64 [[IDXPROM25]], !dbg [[DBG286]] +// CHECK-NEXT: [[TMP28:%.*]] = load i32, i32* [[ARRAYIDX26]], align 4, !dbg [[DBG286]] +// CHECK-NEXT: [[TOBOOL:%.*]] = icmp ne i32 [[TMP28]], 0, !dbg [[DBG286]] +// CHECK-NEXT: [[FROMBOOL:%.*]] = zext i1 [[TOBOOL]] to i8, !dbg [[DBG288:![0-9]+]] +// CHECK-NEXT: store i8 [[FROMBOOL]], i8* [[TMP11]], align 1, !dbg [[DBG288]] +// CHECK-NEXT: br label [[OMP_BODY_CONTINUE:%.*]], !dbg [[DBG289:![0-9]+]] +// CHECK: omp.body.continue: +// CHECK-NEXT: br label [[OMP_INNER_FOR_INC:%.*]], !dbg [[DBG261]] +// CHECK: omp.inner.for.inc: +// CHECK-NEXT: [[TMP29:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !dbg [[DBG256]] +// CHECK-NEXT: [[ADD27:%.*]] = add nsw i32 [[TMP29]], 1, !dbg [[DBG253]] +// CHECK-NEXT: store i32 [[ADD27]], i32* [[DOTOMP_IV]], align 4, !dbg [[DBG253]] +// CHECK-NEXT: br label [[OMP_INNER_FOR_COND]], !dbg [[DBG261]], !llvm.loop [[LOOP290:![0-9]+]] +// CHECK: omp.inner.for.end: +// CHECK-NEXT: br label [[OMP_DISPATCH_INC:%.*]], !dbg [[DBG261]] +// CHECK: omp.dispatch.inc: +// CHECK-NEXT: [[TMP30:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4, !dbg [[DBG256]] +// CHECK-NEXT: [[TMP31:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4, !dbg [[DBG256]] +// CHECK-NEXT: [[ADD28:%.*]] = add nsw i32 [[TMP30]], [[TMP31]], !dbg [[DBG253]] +// CHECK-NEXT: store i32 [[ADD28]], i32* [[DOTOMP_LB]], align 4, !dbg [[DBG253]] +// CHECK-NEXT: [[TMP32:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4, !dbg [[DBG256]] +// CHECK-NEXT: [[TMP33:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4, !dbg [[DBG256]] +// CHECK-NEXT: [[ADD29:%.*]] = add nsw i32 [[TMP32]], [[TMP33]], !dbg [[DBG253]] +// CHECK-NEXT: store i32 [[ADD29]], i32* [[DOTOMP_UB]], align 4, !dbg [[DBG253]] +// CHECK-NEXT: br label [[OMP_DISPATCH_COND]], !dbg [[DBG261]], !llvm.loop [[LOOP292:![0-9]+]] +// CHECK: omp.dispatch.end: +// CHECK-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB13:[0-9]+]], i32 [[TMP13]]), !dbg [[DBG291:![0-9]+]] +// CHECK-NEXT: ret void, !dbg [[DBG293:![0-9]+]] +// +// +// CHECK-LABEL: define {{[^@]+}}@__omp_outlined__4 +// CHECK-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x [10 x [10 x i32]]]* nonnull align 4 dereferenceable(4000) [[C:%.*]], i32* nonnull 
align 4 dereferenceable(4) [[A:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[B:%.*]], i8* nonnull align 1 dereferenceable(1) [[BB:%.*]]) #[[ATTR0]] !dbg [[DBG294:![0-9]+]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x [10 x i32]]]*, align 8 +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 8 +// CHECK-NEXT: [[B_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 8 +// CHECK-NEXT: [[BB_ADDR:%.*]] = alloca i8*, align 8 +// CHECK-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK-NEXT: call void @llvm.dbg.declare(metadata i32** [[DOTGLOBAL_TID__ADDR]], metadata [[META297:![0-9]+]], metadata !DIExpression()), !dbg [[DBG298:![0-9]+]] +// CHECK-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +// CHECK-NEXT: call void @llvm.dbg.declare(metadata i32** [[DOTBOUND_TID__ADDR]], metadata [[META299:![0-9]+]], metadata !DIExpression()), !dbg [[DBG298]] +// CHECK-NEXT: store [10 x [10 x [10 x i32]]]* [[C]], [10 x [10 x [10 x i32]]]** [[C_ADDR]], align 8 +// CHECK-NEXT: call void @llvm.dbg.declare(metadata [10 x [10 x [10 x i32]]]** [[C_ADDR]], metadata [[META300:![0-9]+]], metadata !DIExpression()), !dbg [[DBG298]] +// CHECK-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 8 +// CHECK-NEXT: call void @llvm.dbg.declare(metadata i32** [[A_ADDR]], metadata [[META301:![0-9]+]], metadata !DIExpression()), !dbg [[DBG298]] +// CHECK-NEXT: store [10 x [10 x i32]]* [[B]], [10 x [10 x i32]]** [[B_ADDR]], align 8 +// CHECK-NEXT: call void @llvm.dbg.declare(metadata [10 x [10 x i32]]** [[B_ADDR]], metadata [[META302:![0-9]+]], metadata !DIExpression()), !dbg [[DBG298]] +// CHECK-NEXT: store i8* [[BB]], i8** [[BB_ADDR]], align 8 +// CHECK-NEXT: call void @llvm.dbg.declare(metadata i8** [[BB_ADDR]], metadata [[META303:![0-9]+]], metadata !DIExpression()), !dbg [[DBG298]] +// CHECK-NEXT: [[TMP0:%.*]] = load [10 x [10 x [10 x i32]]]*, [10 x [10 x [10 x i32]]]** [[C_ADDR]], align 8, !dbg [[DBG304:![0-9]+]] +// CHECK-NEXT: [[TMP1:%.*]] = load i32*, i32** [[A_ADDR]], align 8, !dbg [[DBG304]] +// CHECK-NEXT: [[TMP2:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[B_ADDR]], align 8, !dbg [[DBG304]] +// CHECK-NEXT: [[TMP3:%.*]] = load i8*, i8** [[BB_ADDR]], align 8, !dbg [[DBG304]] +// CHECK-NEXT: [[TMP4:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8, !dbg [[DBG304]] +// CHECK-NEXT: [[TMP5:%.*]] = load i32*, i32** [[DOTBOUND_TID__ADDR]], align 8, !dbg [[DBG304]] +// CHECK-NEXT: [[TMP6:%.*]] = load [10 x [10 x [10 x i32]]]*, [10 x [10 x [10 x i32]]]** [[C_ADDR]], align 8, !dbg [[DBG304]] +// CHECK-NEXT: [[TMP7:%.*]] = load i32*, i32** [[A_ADDR]], align 8, !dbg [[DBG304]] +// CHECK-NEXT: [[TMP8:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[B_ADDR]], align 8, !dbg [[DBG304]] +// CHECK-NEXT: [[TMP9:%.*]] = load i8*, i8** [[BB_ADDR]], align 8, !dbg [[DBG304]] +// CHECK-NEXT: [[TMP10:%.*]] = addrspacecast [10 x [10 x [10 x i32]]]* [[TMP6]] to [10 x [10 x [10 x i32]]] addrspace(1)*, !dbg [[DBG304]] +// CHECK-NEXT: [[TMP11:%.*]] = addrspacecast i32* [[TMP7]] to i32 addrspace(1)*, !dbg [[DBG304]] +// CHECK-NEXT: [[TMP12:%.*]] = addrspacecast [10 x [10 x i32]]* [[TMP8]] to [10 x [10 x i32]] addrspace(1)*, !dbg [[DBG304]] +// CHECK-NEXT: [[TMP13:%.*]] = addrspacecast i8* [[TMP9]] to i8 addrspace(1)*, !dbg [[DBG304]] +// CHECK-NEXT: call void @__omp_outlined___debug__3(i32* 
[[TMP4]], i32* [[TMP5]], [10 x [10 x [10 x i32]]] addrspace(1)* [[TMP10]], i32 addrspace(1)* [[TMP11]], [10 x [10 x i32]] addrspace(1)* [[TMP12]], i8 addrspace(1)* [[TMP13]]) #[[ATTR4]], !dbg [[DBG304]] +// CHECK-NEXT: ret void, !dbg [[DBG304]] +// +// +// CHECK-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l41 +// CHECK-SAME: ([10 x [10 x [10 x i32]]]* nonnull align 4 dereferenceable(4000) [[C:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[B:%.*]], i8* nonnull align 1 dereferenceable(1) [[BB:%.*]]) #[[ATTR0]] !dbg [[DBG305:![0-9]+]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x [10 x i32]]]*, align 8 +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 8 +// CHECK-NEXT: [[B_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 8 +// CHECK-NEXT: [[BB_ADDR:%.*]] = alloca i8*, align 8 +// CHECK-NEXT: store [10 x [10 x [10 x i32]]]* [[C]], [10 x [10 x [10 x i32]]]** [[C_ADDR]], align 8 +// CHECK-NEXT: call void @llvm.dbg.declare(metadata [10 x [10 x [10 x i32]]]** [[C_ADDR]], metadata [[META308:![0-9]+]], metadata !DIExpression()), !dbg [[DBG309:![0-9]+]] +// CHECK-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 8 +// CHECK-NEXT: call void @llvm.dbg.declare(metadata i32** [[A_ADDR]], metadata [[META310:![0-9]+]], metadata !DIExpression()), !dbg [[DBG309]] +// CHECK-NEXT: store [10 x [10 x i32]]* [[B]], [10 x [10 x i32]]** [[B_ADDR]], align 8 +// CHECK-NEXT: call void @llvm.dbg.declare(metadata [10 x [10 x i32]]** [[B_ADDR]], metadata [[META311:![0-9]+]], metadata !DIExpression()), !dbg [[DBG309]] +// CHECK-NEXT: store i8* [[BB]], i8** [[BB_ADDR]], align 8 +// CHECK-NEXT: call void @llvm.dbg.declare(metadata i8** [[BB_ADDR]], metadata [[META312:![0-9]+]], metadata !DIExpression()), !dbg [[DBG309]] +// CHECK-NEXT: [[TMP0:%.*]] = load [10 x [10 x [10 x i32]]]*, [10 x [10 x [10 x i32]]]** [[C_ADDR]], align 8, !dbg [[DBG313:![0-9]+]] +// CHECK-NEXT: [[TMP1:%.*]] = load i32*, i32** [[A_ADDR]], align 8, !dbg [[DBG313]] +// CHECK-NEXT: [[TMP2:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[B_ADDR]], align 8, !dbg [[DBG313]] +// CHECK-NEXT: [[TMP3:%.*]] = load i8*, i8** [[BB_ADDR]], align 8, !dbg [[DBG313]] +// CHECK-NEXT: [[TMP4:%.*]] = load [10 x [10 x [10 x i32]]]*, [10 x [10 x [10 x i32]]]** [[C_ADDR]], align 8, !dbg [[DBG313]] +// CHECK-NEXT: [[TMP5:%.*]] = load i32*, i32** [[A_ADDR]], align 8, !dbg [[DBG313]] +// CHECK-NEXT: [[TMP6:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[B_ADDR]], align 8, !dbg [[DBG313]] +// CHECK-NEXT: [[TMP7:%.*]] = load i8*, i8** [[BB_ADDR]], align 8, !dbg [[DBG313]] +// CHECK-NEXT: [[TMP8:%.*]] = addrspacecast [10 x [10 x [10 x i32]]]* [[TMP4]] to [10 x [10 x [10 x i32]]] addrspace(1)*, !dbg [[DBG313]] +// CHECK-NEXT: [[TMP9:%.*]] = addrspacecast i32* [[TMP5]] to i32 addrspace(1)*, !dbg [[DBG313]] +// CHECK-NEXT: [[TMP10:%.*]] = addrspacecast [10 x [10 x i32]]* [[TMP6]] to [10 x [10 x i32]] addrspace(1)*, !dbg [[DBG313]] +// CHECK-NEXT: [[TMP11:%.*]] = addrspacecast i8* [[TMP7]] to i8 addrspace(1)*, !dbg [[DBG313]] +// CHECK-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l41_debug__([10 x [10 x [10 x i32]]] addrspace(1)* [[TMP8]], i32 addrspace(1)* [[TMP9]], [10 x [10 x i32]] addrspace(1)* [[TMP10]], i8 addrspace(1)* [[TMP11]]) #[[ATTR4]], !dbg [[DBG313]] +// CHECK-NEXT: ret void, !dbg [[DBG313]] +// diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def b/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def --- 
a/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def +++ b/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def @@ -414,6 +414,8 @@ __OMP_RTL(__kmpc_spmd_kernel_init, false, Void, Int32, Int16) __OMP_RTL(__kmpc_spmd_kernel_deinit_v2, false, Void, Int16) __OMP_RTL(__kmpc_kernel_prepare_parallel, false, Void, VoidPtr) +__OMP_RTL(__kmpc_parallel_51, false, Void, IdentPtr, Int32, Int32, Int32, Int32, + VoidPtr, VoidPtr, VoidPtrPtr, SizeTy) __OMP_RTL(__kmpc_kernel_parallel, false, Int1, VoidPtrPtr) __OMP_RTL(__kmpc_kernel_end_parallel, false, Void, ) __OMP_RTL(__kmpc_serialized_parallel, false, Void, IdentPtr, Int32) diff --git a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp --- a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp +++ b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp @@ -1651,10 +1651,12 @@ // Allow direct calls. if (CB->isCallee(&U)) return getUniqueKernelFor(*CB); - // Allow the use in __kmpc_kernel_prepare_parallel calls. - if (Function *Callee = CB->getCalledFunction()) - if (Callee->getName() == "__kmpc_kernel_prepare_parallel") - return getUniqueKernelFor(*CB); + + OMPInformationCache::RuntimeFunctionInfo &KernelParallelRFI = + OMPInfoCache.RFIs[OMPRTL___kmpc_parallel_51]; + // Allow the use in __kmpc_parallel_51 calls. + if (OpenMPOpt::getCallIfRegularCall(*U.getUser(), &KernelParallelRFI)) + return getUniqueKernelFor(*CB); return nullptr; } // Disallow every other use. @@ -1678,19 +1680,19 @@ } bool OpenMPOpt::rewriteDeviceCodeStateMachine() { - OMPInformationCache::RuntimeFunctionInfo &KernelPrepareParallelRFI = - OMPInfoCache.RFIs[OMPRTL___kmpc_kernel_prepare_parallel]; + OMPInformationCache::RuntimeFunctionInfo &KernelParallelRFI = + OMPInfoCache.RFIs[OMPRTL___kmpc_parallel_51]; bool Changed = false; - if (!KernelPrepareParallelRFI) + if (!KernelParallelRFI) return Changed; for (Function *F : SCC) { - // Check if the function is uses in a __kmpc_kernel_prepare_parallel call at + // Check if the function is used in a __kmpc_parallel_51 call at // all. bool UnknownUse = false; - bool KernelPrepareUse = false; + bool KernelParallelUse = false; unsigned NumDirectCalls = 0; SmallVector ToBeReplacedStateMachineUses; @@ -1705,25 +1707,30 @@ ToBeReplacedStateMachineUses.push_back(&U); return; } - if (!KernelPrepareUse && OpenMPOpt::getCallIfRegularCall( - *U.getUser(), &KernelPrepareParallelRFI)) { - KernelPrepareUse = true; + + // Find wrapper functions that represent parallel kernels. + CallInst *CI = + OpenMPOpt::getCallIfRegularCall(*U.getUser(), &KernelParallelRFI); + const unsigned int WrapperFunctionArgNo = 6; + if (!KernelParallelUse && CI && + CI->getArgOperandNo(&U) == WrapperFunctionArgNo) { + KernelParallelUse = true; ToBeReplacedStateMachineUses.push_back(&U); return; } UnknownUse = true; }); - // Do not emit a remark if we haven't seen a __kmpc_kernel_prepare_parallel + // Do not emit a remark if we haven't seen a __kmpc_parallel_51 // use. - if (!KernelPrepareUse) + if (!KernelParallelUse) continue; { auto Remark = [&](OptimizationRemark OR) { return OR << "Found a parallel region that is called in a target " "region but not part of a combined target construct nor " - "nesed inside a target construct without intermediate " + "nested inside a target construct without intermediate " "code.
This can lead to excessive register usage for " "unrelated target regions in the same translation unit " "due to spurious call edges assumed by ptxas."; @@ -1747,7 +1754,7 @@ continue; } - // Even if we have __kmpc_kernel_prepare_parallel calls, we (for now) give + // Even if we have __kmpc_parallel_51 calls, we (for now) give // up if the function is not called from a unique kernel. Kernel K = getUniqueKernelFor(*F); if (!K) { diff --git a/llvm/test/Transforms/OpenMP/gpu_state_machine_function_ptr_replacement.ll b/llvm/test/Transforms/OpenMP/gpu_state_machine_function_ptr_replacement.ll --- a/llvm/test/Transforms/OpenMP/gpu_state_machine_function_ptr_replacement.ll +++ b/llvm/test/Transforms/OpenMP/gpu_state_machine_function_ptr_replacement.ll @@ -29,14 +29,17 @@ ; CHECK-DAG: icmp eq i8* %5, @__omp_outlined__1_wrapper.ID ; CHECK-DAG: icmp eq i8* %7, @__omp_outlined__3_wrapper.ID -; CHECK-DAG: call void @__kmpc_kernel_prepare_parallel(i8* @__omp_outlined__1_wrapper.ID) -; CHECK-DAG: call void @__kmpc_kernel_prepare_parallel(i8* bitcast (void ()* @__omp_outlined__2_wrapper to i8*)) -; CHECK-DAG: call void @__kmpc_kernel_prepare_parallel(i8* @__omp_outlined__3_wrapper.ID) +; CHECK-DAG: call void @__kmpc_parallel_51(%struct.ident_t* @1, i32 %1, i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__1 to i8*), i8* @__omp_outlined__1_wrapper.ID, i8** %2, i64 0) +; CHECK-DAG: call void @__kmpc_parallel_51(%struct.ident_t* @1, i32 %0, i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__2 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__2_wrapper to i8*), i8** %1, i64 0) +; CHECK-DAG: call void @__kmpc_parallel_51(%struct.ident_t* @1, i32 %1, i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__3 to i8*), i8* @__omp_outlined__3_wrapper.ID, i8** %3, i64 0) %struct.ident_t = type { i32, i32, i32, i32, i8* } -define internal void @__omp_offloading_35_a1e179_foo_l7_worker() { +@0 = private unnamed_addr constant [23 x i8] c";unknown;unknown;0;0;;\00", align 1 +@1 = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @0, i32 0, i32 0) }, align 8 + +define internal void @__omp_offloading_50_6dfa0f01_foo_l6_worker() { entry: %work_fn = alloca i8*, align 8 %exec_status = alloca i8, align 1 @@ -59,36 +62,36 @@ br i1 %is_active, label %.execute.parallel, label %.barrier.parallel .execute.parallel: ; preds = %.select.workers - %4 = call i32 @__kmpc_global_thread_num(%struct.ident_t* null) + %4 = call i32 @__kmpc_global_thread_num(%struct.ident_t* @1) %5 = load i8*, i8** %work_fn, align 8 - %work_match = icmp eq i8* %5, bitcast (void ()* @__omp_outlined__1_wrapper to i8*) + %work_match = icmp eq i8* %5, bitcast (void (i16, i32)* @__omp_outlined__1_wrapper to i8*) br i1 %work_match, label %.execute.fn, label %.check.next .execute.fn: ; preds = %.execute.parallel - call void @__omp_outlined__1_wrapper() + call void @__omp_outlined__1_wrapper(i16 0, i32 %4) br label %.terminate.parallel .check.next: ; preds = %.execute.parallel %6 = load i8*, i8** %work_fn, align 8 - %work_match1 = icmp eq i8* %6, bitcast (void ()* @__omp_outlined__2_wrapper to i8*) + %work_match1 = icmp eq i8* %6, bitcast (void (i16, i32)* @__omp_outlined__2_wrapper to i8*) br i1 %work_match1, label %.execute.fn2, label %.check.next3 .execute.fn2: ; preds = %.check.next - call void @__omp_outlined__2_wrapper() + call void @__omp_outlined__2_wrapper(i16 0, i32 %4) br label %.terminate.parallel .check.next3: ; 
preds = %.check.next %7 = load i8*, i8** %work_fn, align 8 - %work_match4 = icmp eq i8* %7, bitcast (void ()* @__omp_outlined__3_wrapper to i8*) + %work_match4 = icmp eq i8* %7, bitcast (void (i16, i32)* @__omp_outlined__3_wrapper to i8*) br i1 %work_match4, label %.execute.fn5, label %.check.next6 .execute.fn5: ; preds = %.check.next3 - call void @__omp_outlined__3_wrapper() + call void @__omp_outlined__3_wrapper(i16 0, i32 %4) br label %.terminate.parallel .check.next6: ; preds = %.check.next3 - %8 = bitcast i8* %2 to void ()* - call void %8() + %8 = bitcast i8* %2 to void (i16, i32)* + call void %8(i16 0, i32 %4) br label %.terminate.parallel .terminate.parallel: ; preds = %.check.next6, %.execute.fn5, %.execute.fn2, %.execute.fn @@ -103,52 +106,174 @@ ret void } -define weak void @__omp_offloading_35_a1e179_foo_l7() { - call void @__omp_offloading_35_a1e179_foo_l7_worker() - call void @__omp_outlined__() +define weak void @__omp_offloading_50_6dfa0f01_foo_l6() { +entry: + %.zero.addr = alloca i32, align 4 + %.threadid_temp. = alloca i32, align 4 + store i32 0, i32* %.zero.addr, align 4 + %nvptx_tid = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() + %nvptx_num_threads = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() + %nvptx_warp_size = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() + %thread_limit = sub nuw i32 %nvptx_num_threads, %nvptx_warp_size + %0 = icmp ult i32 %nvptx_tid, %thread_limit + br i1 %0, label %.worker, label %.mastercheck + +.worker: ; preds = %entry + call void @__omp_offloading_50_6dfa0f01_foo_l6_worker() + br label %.exit + +.mastercheck: ; preds = %entry + %nvptx_tid1 = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() + %nvptx_num_threads2 = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() + %nvptx_warp_size3 = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() + %1 = sub nuw i32 %nvptx_warp_size3, 1 + %2 = sub nuw i32 %nvptx_num_threads2, 1 + %3 = xor i32 %1, -1 + %master_tid = and i32 %2, %3 + %4 = icmp eq i32 %nvptx_tid1, %master_tid + br i1 %4, label %.master, label %.exit + +.master: ; preds = %.mastercheck + %nvptx_num_threads4 = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() + %nvptx_warp_size5 = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() + %thread_limit6 = sub nuw i32 %nvptx_num_threads4, %nvptx_warp_size5 + call void @__kmpc_kernel_init(i32 %thread_limit6, i16 1) + call void @__kmpc_data_sharing_init_stack() + %5 = call i32 @__kmpc_global_thread_num(%struct.ident_t* @1) + store i32 %5, i32* %.threadid_temp., align 4 + call void @__omp_outlined__(i32* %.threadid_temp., i32* %.zero.addr) + br label %.termination.notifier + +.termination.notifier: ; preds = %.master + call void @__kmpc_kernel_deinit(i16 1) + call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) + br label %.exit + +.exit: ; preds = %.termination.notifier, %.mastercheck, %.worker ret void } -define internal void @__omp_outlined__() { - call void @__kmpc_kernel_prepare_parallel(i8* bitcast (void ()* @__omp_outlined__1_wrapper to i8*)) - call void @bar() - call void @__kmpc_kernel_prepare_parallel(i8* bitcast (void ()* @__omp_outlined__3_wrapper to i8*)) +declare i32 @llvm.nvvm.read.ptx.sreg.tid.x() + +declare i32 @llvm.nvvm.read.ptx.sreg.ntid.x() + +declare i32 @llvm.nvvm.read.ptx.sreg.warpsize() + +declare void @__kmpc_kernel_init(i32, i16) + +declare void @__kmpc_data_sharing_init_stack() + +define internal void @__omp_outlined__(i32* noalias %.global_tid., i32* noalias %.bound_tid.) 
{ +entry: + %.global_tid..addr = alloca i32*, align 8 + %.bound_tid..addr = alloca i32*, align 8 + %captured_vars_addrs = alloca [0 x i8*], align 8 + %captured_vars_addrs1 = alloca [0 x i8*], align 8 + store i32* %.global_tid., i32** %.global_tid..addr, align 8 + store i32* %.bound_tid., i32** %.bound_tid..addr, align 8 + %0 = load i32*, i32** %.global_tid..addr, align 8 + %1 = load i32, i32* %0, align 4 + %2 = bitcast [0 x i8*]* %captured_vars_addrs to i8** + call void @__kmpc_parallel_51(%struct.ident_t* @1, i32 %1, i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__1 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__1_wrapper to i8*), i8** %2, i64 0) + call void @bar() + %3 = bitcast [0 x i8*]* %captured_vars_addrs1 to i8** + call void @__kmpc_parallel_51(%struct.ident_t* @1, i32 %1, i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__3 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__3_wrapper to i8*), i8** %3, i64 0) ret void } -define internal void @__omp_outlined__1() { +define internal void @__omp_outlined__1(i32* noalias %.global_tid., i32* noalias %.bound_tid.) { +entry: + %.global_tid..addr = alloca i32*, align 8 + %.bound_tid..addr = alloca i32*, align 8 + store i32* %.global_tid., i32** %.global_tid..addr, align 8 + store i32* %.bound_tid., i32** %.bound_tid..addr, align 8 ret void } -define internal void @__omp_outlined__1_wrapper() { - call void @__omp_outlined__1() +define internal void @__omp_outlined__1_wrapper(i16 zeroext %0, i32 %1) { +entry: + %.addr = alloca i16, align 2 + %.addr1 = alloca i32, align 4 + %.zero.addr = alloca i32, align 4 + %global_args = alloca i8**, align 8 + store i32 0, i32* %.zero.addr, align 4 + store i16 %0, i16* %.addr, align 2 + store i32 %1, i32* %.addr1, align 4 + call void @__kmpc_get_shared_variables(i8*** %global_args) + call void @__omp_outlined__1(i32* %.addr1, i32* %.zero.addr) ret void } -define hidden void @bar() { - call void @__kmpc_kernel_prepare_parallel(i8* bitcast (void ()* @__omp_outlined__2_wrapper to i8*)) +declare void @__kmpc_get_shared_variables(i8***) + +declare void @__kmpc_parallel_51(%struct.ident_t*, i32, i32, i32, i32, i8*, i8*, i8**, i64) + +define hidden void @bar() { +entry: + %captured_vars_addrs = alloca [0 x i8*], align 8 + %0 = call i32 @__kmpc_global_thread_num(%struct.ident_t* @1) + %1 = bitcast [0 x i8*]* %captured_vars_addrs to i8** + call void @__kmpc_parallel_51(%struct.ident_t* @1, i32 %0, i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__2 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__2_wrapper to i8*), i8** %1, i64 0) ret void } -define internal void @__omp_outlined__2_wrapper() { +define internal void @__omp_outlined__2(i32* noalias %.global_tid., i32* noalias %.bound_tid.) 
{ +entry: + %.global_tid..addr = alloca i32*, align 8 + %.bound_tid..addr = alloca i32*, align 8 + store i32* %.global_tid., i32** %.global_tid..addr, align 8 + store i32* %.bound_tid., i32** %.bound_tid..addr, align 8 ret void } -define internal void @__omp_outlined__3_wrapper() { +define internal void @__omp_outlined__2_wrapper(i16 zeroext %0, i32 %1) { +entry: + %.addr = alloca i16, align 2 + %.addr1 = alloca i32, align 4 + %.zero.addr = alloca i32, align 4 + %global_args = alloca i8**, align 8 + store i32 0, i32* %.zero.addr, align 4 + store i16 %0, i16* %.addr, align 2 + store i32 %1, i32* %.addr1, align 4 + call void @__kmpc_get_shared_variables(i8*** %global_args) + call void @__omp_outlined__2(i32* %.addr1, i32* %.zero.addr) ret void } -declare void @__kmpc_kernel_prepare_parallel(i8* %WorkFn) +declare i32 @__kmpc_global_thread_num(%struct.ident_t*) -declare zeroext i1 @__kmpc_kernel_parallel(i8** nocapture %WorkFn) +define internal void @__omp_outlined__3(i32* noalias %.global_tid., i32* noalias %.bound_tid.) { +entry: + %.global_tid..addr = alloca i32*, align 8 + %.bound_tid..addr = alloca i32*, align 8 + store i32* %.global_tid., i32** %.global_tid..addr, align 8 + store i32* %.bound_tid., i32** %.bound_tid..addr, align 8 + ret void +} -declare void @__kmpc_kernel_end_parallel() +define internal void @__omp_outlined__3_wrapper(i16 zeroext %0, i32 %1) { +entry: + %.addr = alloca i16, align 2 + %.addr1 = alloca i32, align 4 + %.zero.addr = alloca i32, align 4 + %global_args = alloca i8**, align 8 + store i32 0, i32* %.zero.addr, align 4 + store i16 %0, i16* %.addr, align 2 + store i32 %1, i32* %.addr1, align 4 + call void @__kmpc_get_shared_variables(i8*** %global_args) + call void @__omp_outlined__3(i32* %.addr1, i32* %.zero.addr) + ret void +} -declare void @__kmpc_barrier_simple_spmd(%struct.ident_t* nocapture readonly %loc_ref, i32 %tid) +declare void @__kmpc_kernel_deinit(i16) -declare i32 @__kmpc_global_thread_num(%struct.ident_t* nocapture readonly) +declare void @__kmpc_barrier_simple_spmd(%struct.ident_t*, i32) + +declare i1 @__kmpc_kernel_parallel(i8**) + +declare void @__kmpc_kernel_end_parallel() -!nvvm.annotations = !{!0} +!nvvm.annotations = !{!1} -!0 = !{void ()* @__omp_offloading_35_a1e179_foo_l7, !"kernel", i32 1} +!1 = !{void ()* @__omp_offloading_50_6dfa0f01_foo_l6, !"kernel", i32 1} diff --git a/openmp/libomptarget/deviceRTLs/common/generated_microtask_cases.gen b/openmp/libomptarget/deviceRTLs/common/generated_microtask_cases.gen new file mode 100644 --- /dev/null +++ b/openmp/libomptarget/deviceRTLs/common/generated_microtask_cases.gen @@ -0,0 +1,405 @@ +case 0: +((void (*)(kmp_int32 *, kmp_int32 * +))fn)(&global_tid, &bound_tid +); +break; +case 1: +((void (*)(kmp_int32 *, kmp_int32 * +, void *))fn)(&global_tid, &bound_tid +, args[0]); +break; +case 2: +((void (*)(kmp_int32 *, kmp_int32 * +, void *, void *))fn)(&global_tid, &bound_tid +, args[0], args[1]); +break; +case 3: +((void (*)(kmp_int32 *, kmp_int32 * +, void *, void *, void *))fn)(&global_tid, &bound_tid +, args[0], args[1], args[2]); +break; +case 4: +((void (*)(kmp_int32 *, kmp_int32 * +, void *, void *, void *, void * +))fn)(&global_tid, &bound_tid +, args[0], args[1], args[2], args[3] +); +break; +case 5: +((void (*)(kmp_int32 *, kmp_int32 * +, void *, void *, void *, void * +, void *))fn)(&global_tid, &bound_tid +, args[0], args[1], args[2], args[3] +, args[4]); +break; +case 6: +((void (*)(kmp_int32 *, kmp_int32 * +, void *, void *, void *, void * +, void *, void *))fn)(&global_tid, &bound_tid 
+, args[0], args[1], args[2], args[3] +, args[4], args[5]); +break; +case 7: +((void (*)(kmp_int32 *, kmp_int32 * +, void *, void *, void *, void * +, void *, void *, void *))fn)(&global_tid, &bound_tid +, args[0], args[1], args[2], args[3] +, args[4], args[5], args[6]); +break; +case 8: +((void (*)(kmp_int32 *, kmp_int32 * +, void *, void *, void *, void * +, void *, void *, void *, void * +))fn)(&global_tid, &bound_tid +, args[0], args[1], args[2], args[3] +, args[4], args[5], args[6], args[7] +); +break; +case 9: +((void (*)(kmp_int32 *, kmp_int32 * +, void *, void *, void *, void * +, void *, void *, void *, void * +, void *))fn)(&global_tid, &bound_tid +, args[0], args[1], args[2], args[3] +, args[4], args[5], args[6], args[7] +, args[8]); +break; +case 10: +((void (*)(kmp_int32 *, kmp_int32 * +, void *, void *, void *, void * +, void *, void *, void *, void * +, void *, void *))fn)(&global_tid, &bound_tid +, args[0], args[1], args[2], args[3] +, args[4], args[5], args[6], args[7] +, args[8], args[9]); +break; +case 11: +((void (*)(kmp_int32 *, kmp_int32 * +, void *, void *, void *, void * +, void *, void *, void *, void * +, void *, void *, void *))fn)(&global_tid, &bound_tid +, args[0], args[1], args[2], args[3] +, args[4], args[5], args[6], args[7] +, args[8], args[9], args[10]); +break; +case 12: +((void (*)(kmp_int32 *, kmp_int32 * +, void *, void *, void *, void * +, void *, void *, void *, void * +, void *, void *, void *, void * +))fn)(&global_tid, &bound_tid +, args[0], args[1], args[2], args[3] +, args[4], args[5], args[6], args[7] +, args[8], args[9], args[10], args[11] +); +break; +case 13: +((void (*)(kmp_int32 *, kmp_int32 * +, void *, void *, void *, void * +, void *, void *, void *, void * +, void *, void *, void *, void * +, void *))fn)(&global_tid, &bound_tid +, args[0], args[1], args[2], args[3] +, args[4], args[5], args[6], args[7] +, args[8], args[9], args[10], args[11] +, args[12]); +break; +case 14: +((void (*)(kmp_int32 *, kmp_int32 * +, void *, void *, void *, void * +, void *, void *, void *, void * +, void *, void *, void *, void * +, void *, void *))fn)(&global_tid, &bound_tid +, args[0], args[1], args[2], args[3] +, args[4], args[5], args[6], args[7] +, args[8], args[9], args[10], args[11] +, args[12], args[13]); +break; +case 15: +((void (*)(kmp_int32 *, kmp_int32 * +, void *, void *, void *, void * +, void *, void *, void *, void * +, void *, void *, void *, void * +, void *, void *, void *))fn)(&global_tid, &bound_tid +, args[0], args[1], args[2], args[3] +, args[4], args[5], args[6], args[7] +, args[8], args[9], args[10], args[11] +, args[12], args[13], args[14]); +break; +case 16: +((void (*)(kmp_int32 *, kmp_int32 * +, void *, void *, void *, void * +, void *, void *, void *, void * +, void *, void *, void *, void * +, void *, void *, void *, void * +))fn)(&global_tid, &bound_tid +, args[0], args[1], args[2], args[3] +, args[4], args[5], args[6], args[7] +, args[8], args[9], args[10], args[11] +, args[12], args[13], args[14], args[15] +); +break; +case 17: +((void (*)(kmp_int32 *, kmp_int32 * +, void *, void *, void *, void * +, void *, void *, void *, void * +, void *, void *, void *, void * +, void *, void *, void *, void * +, void *))fn)(&global_tid, &bound_tid +, args[0], args[1], args[2], args[3] +, args[4], args[5], args[6], args[7] +, args[8], args[9], args[10], args[11] +, args[12], args[13], args[14], args[15] +, args[16]); +break; +case 18: +((void (*)(kmp_int32 *, kmp_int32 * +, void *, void *, void *, void * +, void *, void *, void *, 
void * +, void *, void *, void *, void * +, void *, void *, void *, void * +, void *, void *))fn)(&global_tid, &bound_tid +, args[0], args[1], args[2], args[3] +, args[4], args[5], args[6], args[7] +, args[8], args[9], args[10], args[11] +, args[12], args[13], args[14], args[15] +, args[16], args[17]); +break; +case 19: +((void (*)(kmp_int32 *, kmp_int32 * +, void *, void *, void *, void * +, void *, void *, void *, void * +, void *, void *, void *, void * +, void *, void *, void *, void * +, void *, void *, void *))fn)(&global_tid, &bound_tid +, args[0], args[1], args[2], args[3] +, args[4], args[5], args[6], args[7] +, args[8], args[9], args[10], args[11] +, args[12], args[13], args[14], args[15] +, args[16], args[17], args[18]); +break; +case 20: +((void (*)(kmp_int32 *, kmp_int32 * +, void *, void *, void *, void * +, void *, void *, void *, void * +, void *, void *, void *, void * +, void *, void *, void *, void * +, void *, void *, void *, void * +))fn)(&global_tid, &bound_tid +, args[0], args[1], args[2], args[3] +, args[4], args[5], args[6], args[7] +, args[8], args[9], args[10], args[11] +, args[12], args[13], args[14], args[15] +, args[16], args[17], args[18], args[19] +); +break; +case 21: +((void (*)(kmp_int32 *, kmp_int32 * +, void *, void *, void *, void * +, void *, void *, void *, void * +, void *, void *, void *, void * +, void *, void *, void *, void * +, void *, void *, void *, void * +, void *))fn)(&global_tid, &bound_tid +, args[0], args[1], args[2], args[3] +, args[4], args[5], args[6], args[7] +, args[8], args[9], args[10], args[11] +, args[12], args[13], args[14], args[15] +, args[16], args[17], args[18], args[19] +, args[20]); +break; +case 22: +((void (*)(kmp_int32 *, kmp_int32 * +, void *, void *, void *, void * +, void *, void *, void *, void * +, void *, void *, void *, void * +, void *, void *, void *, void * +, void *, void *, void *, void * +, void *, void *))fn)(&global_tid, &bound_tid +, args[0], args[1], args[2], args[3] +, args[4], args[5], args[6], args[7] +, args[8], args[9], args[10], args[11] +, args[12], args[13], args[14], args[15] +, args[16], args[17], args[18], args[19] +, args[20], args[21]); +break; +case 23: +((void (*)(kmp_int32 *, kmp_int32 * +, void *, void *, void *, void * +, void *, void *, void *, void * +, void *, void *, void *, void * +, void *, void *, void *, void * +, void *, void *, void *, void * +, void *, void *, void *))fn)(&global_tid, &bound_tid +, args[0], args[1], args[2], args[3] +, args[4], args[5], args[6], args[7] +, args[8], args[9], args[10], args[11] +, args[12], args[13], args[14], args[15] +, args[16], args[17], args[18], args[19] +, args[20], args[21], args[22]); +break; +case 24: +((void (*)(kmp_int32 *, kmp_int32 * +, void *, void *, void *, void * +, void *, void *, void *, void * +, void *, void *, void *, void * +, void *, void *, void *, void * +, void *, void *, void *, void * +, void *, void *, void *, void * +))fn)(&global_tid, &bound_tid +, args[0], args[1], args[2], args[3] +, args[4], args[5], args[6], args[7] +, args[8], args[9], args[10], args[11] +, args[12], args[13], args[14], args[15] +, args[16], args[17], args[18], args[19] +, args[20], args[21], args[22], args[23] +); +break; +case 25: +((void (*)(kmp_int32 *, kmp_int32 * +, void *, void *, void *, void * +, void *, void *, void *, void * +, void *, void *, void *, void * +, void *, void *, void *, void * +, void *, void *, void *, void * +, void *, void *, void *, void * +, void *))fn)(&global_tid, &bound_tid +, args[0], args[1], args[2], 
args[3] +, args[4], args[5], args[6], args[7] +, args[8], args[9], args[10], args[11] +, args[12], args[13], args[14], args[15] +, args[16], args[17], args[18], args[19] +, args[20], args[21], args[22], args[23] +, args[24]); +break; +case 26: +((void (*)(kmp_int32 *, kmp_int32 * +, void *, void *, void *, void * +, void *, void *, void *, void * +, void *, void *, void *, void * +, void *, void *, void *, void * +, void *, void *, void *, void * +, void *, void *, void *, void * +, void *, void *))fn)(&global_tid, &bound_tid +, args[0], args[1], args[2], args[3] +, args[4], args[5], args[6], args[7] +, args[8], args[9], args[10], args[11] +, args[12], args[13], args[14], args[15] +, args[16], args[17], args[18], args[19] +, args[20], args[21], args[22], args[23] +, args[24], args[25]); +break; +case 27: +((void (*)(kmp_int32 *, kmp_int32 * +, void *, void *, void *, void * +, void *, void *, void *, void * +, void *, void *, void *, void * +, void *, void *, void *, void * +, void *, void *, void *, void * +, void *, void *, void *, void * +, void *, void *, void *))fn)(&global_tid, &bound_tid +, args[0], args[1], args[2], args[3] +, args[4], args[5], args[6], args[7] +, args[8], args[9], args[10], args[11] +, args[12], args[13], args[14], args[15] +, args[16], args[17], args[18], args[19] +, args[20], args[21], args[22], args[23] +, args[24], args[25], args[26]); +break; +case 28: +((void (*)(kmp_int32 *, kmp_int32 * +, void *, void *, void *, void * +, void *, void *, void *, void * +, void *, void *, void *, void * +, void *, void *, void *, void * +, void *, void *, void *, void * +, void *, void *, void *, void * +, void *, void *, void *, void * +))fn)(&global_tid, &bound_tid +, args[0], args[1], args[2], args[3] +, args[4], args[5], args[6], args[7] +, args[8], args[9], args[10], args[11] +, args[12], args[13], args[14], args[15] +, args[16], args[17], args[18], args[19] +, args[20], args[21], args[22], args[23] +, args[24], args[25], args[26], args[27] +); +break; +case 29: +((void (*)(kmp_int32 *, kmp_int32 * +, void *, void *, void *, void * +, void *, void *, void *, void * +, void *, void *, void *, void * +, void *, void *, void *, void * +, void *, void *, void *, void * +, void *, void *, void *, void * +, void *, void *, void *, void * +, void *))fn)(&global_tid, &bound_tid +, args[0], args[1], args[2], args[3] +, args[4], args[5], args[6], args[7] +, args[8], args[9], args[10], args[11] +, args[12], args[13], args[14], args[15] +, args[16], args[17], args[18], args[19] +, args[20], args[21], args[22], args[23] +, args[24], args[25], args[26], args[27] +, args[28]); +break; +case 30: +((void (*)(kmp_int32 *, kmp_int32 * +, void *, void *, void *, void * +, void *, void *, void *, void * +, void *, void *, void *, void * +, void *, void *, void *, void * +, void *, void *, void *, void * +, void *, void *, void *, void * +, void *, void *, void *, void * +, void *, void *))fn)(&global_tid, &bound_tid +, args[0], args[1], args[2], args[3] +, args[4], args[5], args[6], args[7] +, args[8], args[9], args[10], args[11] +, args[12], args[13], args[14], args[15] +, args[16], args[17], args[18], args[19] +, args[20], args[21], args[22], args[23] +, args[24], args[25], args[26], args[27] +, args[28], args[29]); +break; +case 31: +((void (*)(kmp_int32 *, kmp_int32 * +, void *, void *, void *, void * +, void *, void *, void *, void * +, void *, void *, void *, void * +, void *, void *, void *, void * +, void *, void *, void *, void * +, void *, void *, void *, void * +, void *, void 
*, void *, void * +, void *, void *, void *))fn)(&global_tid, &bound_tid +, args[0], args[1], args[2], args[3] +, args[4], args[5], args[6], args[7] +, args[8], args[9], args[10], args[11] +, args[12], args[13], args[14], args[15] +, args[16], args[17], args[18], args[19] +, args[20], args[21], args[22], args[23] +, args[24], args[25], args[26], args[27] +, args[28], args[29], args[30]); +break; +case 32: +((void (*)(kmp_int32 *, kmp_int32 * +, void *, void *, void *, void * +, void *, void *, void *, void * +, void *, void *, void *, void * +, void *, void *, void *, void * +, void *, void *, void *, void * +, void *, void *, void *, void * +, void *, void *, void *, void * +, void *, void *, void *, void * +))fn)(&global_tid, &bound_tid +, args[0], args[1], args[2], args[3] +, args[4], args[5], args[6], args[7] +, args[8], args[9], args[10], args[11] +, args[12], args[13], args[14], args[15] +, args[16], args[17], args[18], args[19] +, args[20], args[21], args[22], args[23] +, args[24], args[25], args[26], args[27] +, args[28], args[29], args[30], args[31] +); +break; \ No newline at end of file diff --git a/openmp/libomptarget/deviceRTLs/common/src/omptarget.cu b/openmp/libomptarget/deviceRTLs/common/src/omptarget.cu --- a/openmp/libomptarget/deviceRTLs/common/src/omptarget.cu +++ b/openmp/libomptarget/deviceRTLs/common/src/omptarget.cu @@ -88,11 +88,6 @@ int threadId = GetThreadIdInBlock(); if (threadId == 0) { usedSlotIdx = __kmpc_impl_smid() % MAX_SM; - parallelLevel[0] = - 1 + (GetNumberOfThreadsInBlock() > 1 ? OMP_ACTIVE_PARALLEL_LEVEL : 0); - } else if (GetLaneId() == 0) { - parallelLevel[GetWarpId()] = - 1 + (GetNumberOfThreadsInBlock() > 1 ? OMP_ACTIVE_PARALLEL_LEVEL : 0); } if (!RequiresOMPRuntime) { // Runtime is not required - exit. diff --git a/openmp/libomptarget/deviceRTLs/common/src/parallel.cu b/openmp/libomptarget/deviceRTLs/common/src/parallel.cu --- a/openmp/libomptarget/deviceRTLs/common/src/parallel.cu +++ b/openmp/libomptarget/deviceRTLs/common/src/parallel.cu @@ -154,16 +154,6 @@ (int)newTaskDescr->ThreadId(), (int)nThreads); isActive = true; - // Reconverge the threads at the end of the parallel region to correctly - // handle parallel levels. - // In Cuda9+ in non-SPMD mode we have either 1 worker thread or the whole - // warp. If only 1 thread is active, not need to reconverge the threads. - // If we have the whole warp, reconverge all the threads in the warp before - // actually trying to change the parallel level. Otherwise, parallel level - // can be changed incorrectly because of threads divergence. - bool IsActiveParallelRegion = threadsInTeam != 1; - IncParallelLevel(IsActiveParallelRegion, - IsActiveParallelRegion ? __kmpc_impl_all_lanes : 1u); } return isActive; @@ -180,17 +170,6 @@ omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor(threadId); omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr( threadId, currTaskDescr->GetPrevTaskDescr()); - - // Reconverge the threads at the end of the parallel region to correctly - // handle parallel levels. - // In Cuda9+ in non-SPMD mode we have either 1 worker thread or the whole - // warp. If only 1 thread is active, not need to reconverge the threads. - // If we have the whole warp, reconverge all the threads in the warp before - // actually trying to change the parallel level. Otherwise, parallel level can - // be changed incorrectly because of threads divergence. - bool IsActiveParallelRegion = threadsInTeam != 1; - DecParallelLevel(IsActiveParallelRegion, - IsActiveParallelRegion ? 
__kmpc_impl_all_lanes : 1u); } //////////////////////////////////////////////////////////////////////////////// @@ -301,4 +280,90 @@ PRINT(LD_IO, "call kmpc_push_proc_bind %d\n", (int)proc_bind); } +//////////////////////////////////////////////////////////////////////////////// +// parallel interface +//////////////////////////////////////////////////////////////////////////////// + +EXTERN void __kmpc_parallel_51(kmp_Ident *ident, kmp_int32 global_tid, + kmp_int32 if_expr, kmp_int32 num_threads, + int proc_bind, void *fn, void *wrapper_fn, + void **args, size_t nargs) { + + // Handle the serialized case first, same for SPMD/non-SPMD. + // TODO: Add UNLIKELY to optimize? + bool InParallelRegion = (__kmpc_parallel_level(ident, global_tid) > 0); + if (!if_expr || InParallelRegion) { + __kmpc_serialized_parallel(ident, global_tid); + __kmp_invoke_microtask(global_tid, 0, fn, args, nargs); + __kmpc_end_serialized_parallel(ident, global_tid); + + return; + } + + if (__kmpc_is_spmd_exec_mode()) { + // Increment parallel level for SPMD warps. + if (GetLaneId() == 0) + parallelLevel[GetWarpId()] = + 1 + (GetNumberOfThreadsInBlock() > 1 ? OMP_ACTIVE_PARALLEL_LEVEL : 0); + // TODO: Is this synchronization correct/needed? Could using only a memory + // fence ensure consistency? + __kmpc_impl_syncthreads(); + + __kmp_invoke_microtask(global_tid, 0, fn, args, nargs); + + // Decrement (zero out) parallel level for SPMD warps. + if (GetLaneId() == 0) + parallelLevel[GetWarpId()] = 0; + return; + } + + // Handle the num_threads clause. + if (num_threads != -1) + __kmpc_push_num_threads(ident, global_tid, num_threads); + + __kmpc_kernel_prepare_parallel((void *)wrapper_fn); + + if (nargs) { + void **GlobalArgs; + __kmpc_begin_sharing_variables(&GlobalArgs, nargs); + // TODO: faster memcpy? + for (size_t I = 0; I < nargs; I++) + GlobalArgs[I] = args[I]; + } + + // TODO: What if this is a parallel region with a single thread? The + // existing implementation considers such a region not active. + bool IsActiveParallelRegion = threadsInTeam != 1; + int NumWarps = + threadsInTeam / WARPSIZE + ((threadsInTeam % WARPSIZE) ? 1 : 0); + // Increment parallel level for non-SPMD warps. + for (int I = 0; I < NumWarps; ++I) + parallelLevel[I] += + (1 + (IsActiveParallelRegion ? OMP_ACTIVE_PARALLEL_LEVEL : 0)); + + // Master signals work to activate workers. + __kmpc_barrier_simple_spmd(nullptr, 0); + + // OpenMP [2.5, Parallel Construct, p.49] + // There is an implied barrier at the end of a parallel region. After the + // end of a parallel region, only the master thread of the team resumes + // execution of the enclosing task region. + // + // The master waits at this barrier until all workers are done. + __kmpc_barrier_simple_spmd(nullptr, 0); + + // Decrement parallel level for non-SPMD warps. + for (int I = 0; I < NumWarps; ++I) + parallelLevel[I] -= + (1 + (IsActiveParallelRegion ? OMP_ACTIVE_PARALLEL_LEVEL : 0)); + // TODO: Is synchronization needed here, now that we are out of parallel execution? + + if (nargs) + __kmpc_end_sharing_variables(); + + // TODO: proc_bind is a noop?
+ // if (proc_bind != proc_bind_default) + // __kmpc_push_proc_bind(ident, global_tid, proc_bind); +} + #pragma omp end declare target diff --git a/openmp/libomptarget/deviceRTLs/common/src/support.cu b/openmp/libomptarget/deviceRTLs/common/src/support.cu --- a/openmp/libomptarget/deviceRTLs/common/src/support.cu +++ b/openmp/libomptarget/deviceRTLs/common/src/support.cu @@ -265,4 +265,16 @@ return static_cast<char *>(ReductionScratchpadPtr) + 256; } +// Invoke an outlined parallel function unwrapping arguments (up +// to 32). +void __kmp_invoke_microtask(kmp_int32 global_tid, kmp_int32 bound_tid, void *fn, + void **args, size_t nargs) { + switch (nargs) { +#include "common/generated_microtask_cases.gen" + default: + printf("Too many arguments in __kmp_invoke_microtask, aborting execution.\n"); + __builtin_trap(); + } +} + #pragma omp end declare target diff --git a/openmp/libomptarget/deviceRTLs/common/support.h b/openmp/libomptarget/deviceRTLs/common/support.h --- a/openmp/libomptarget/deviceRTLs/common/support.h +++ b/openmp/libomptarget/deviceRTLs/common/support.h @@ -95,4 +95,9 @@ unsigned int *GetTeamsReductionTimestamp(); char *GetTeamsReductionScratchpad(); +// Invoke an outlined parallel function unwrapping global, shared arguments (up +// to 32). +void __kmp_invoke_microtask(kmp_int32 global_tid, kmp_int32 bound_tid, void *fn, + void **args, size_t nargs); + #endif diff --git a/openmp/libomptarget/deviceRTLs/interface.h b/openmp/libomptarget/deviceRTLs/interface.h --- a/openmp/libomptarget/deviceRTLs/interface.h +++ b/openmp/libomptarget/deviceRTLs/interface.h @@ -177,6 +177,7 @@ * The struct is identical to the one in the kmp.h file. * We maintain the same data structure for compatibility. */ +typedef short kmp_int16; typedef int kmp_int32; typedef struct ident { kmp_int32 reserved_1; /**< might be used in Fortran; see above */ @@ -435,6 +436,22 @@ EXTERN void __kmpc_end_sharing_variables(); EXTERN void __kmpc_get_shared_variables(void ***GlobalArgs); +/// Entry point to start a new parallel region. +/// +/// \param ident The source identifier. +/// \param global_tid The global thread ID. +/// \param if_expr The if(expr), or 1 if none given. +/// \param num_threads The num_threads(expr), or -1 if none given. +/// \param proc_bind The proc_bind, or `proc_bind_default` if none given. +/// \param fn The outlined parallel region function. +/// \param wrapper_fn The worker wrapper function of fn. +/// \param args The pointer array of arguments to fn. +/// \param nargs The number of arguments to fn. +EXTERN void __kmpc_parallel_51(ident_t *ident, kmp_int32 global_tid, + kmp_int32 if_expr, kmp_int32 num_threads, + int proc_bind, void *fn, void *wrapper_fn, + void **args, size_t nargs); + // SPMD execution mode interrogation function.
EXTERN int8_t __kmpc_is_spmd_exec_mode(); diff --git a/openmp/libomptarget/test/offloading/bug49779.cpp b/openmp/libomptarget/test/offloading/bug49779.cpp new file mode 100644 --- /dev/null +++ b/openmp/libomptarget/test/offloading/bug49779.cpp @@ -0,0 +1,36 @@ +// RUN: %libomptarget-compilexx-run-and-check-aarch64-unknown-linux-gnu +// RUN: %libomptarget-compilexx-run-and-check-powerpc64-ibm-linux-gnu +// RUN: %libomptarget-compilexx-run-and-check-powerpc64le-ibm-linux-gnu +// RUN: %libomptarget-compilexx-run-and-check-x86_64-pc-linux-gnu +// RUN: %libomptarget-compilexx-run-and-check-nvptx64-nvidia-cuda + +#include <cassert> +#include <iostream> + +void work(int *C) { +#pragma omp atomic + ++(*C); +} + +void use(int *C) { +#pragma omp parallel num_threads(2) + work(C); +} + +int main() { + int C = 0; +#pragma omp target map(C) + { + use(&C); +#pragma omp parallel num_threads(2) + use(&C); + } + + assert(C >= 2 && C <= 6); + + std::cout << "PASS\n"; + + return 0; +} + +// CHECK: PASS diff --git a/openmp/libomptarget/utils/generate_microtask_cases.py b/openmp/libomptarget/utils/generate_microtask_cases.py new file mode 100755 --- /dev/null +++ b/openmp/libomptarget/utils/generate_microtask_cases.py @@ -0,0 +1,31 @@ +#!/usr/bin/env python3 + +import argparse + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument('--max_args', type=int, help='Max number of arguments to generate case statements for', required=True) + parser.add_argument('--output', help='Output header file to include', required=True) + args = parser.parse_args() + + output = '' + for i in range(args.max_args+1): + output += 'case %d:\n'%(i) + output += '((void (*)(kmp_int32 *, kmp_int32 *\n' + for j in range(i): + output += ', void *' + if (j+1)%4 == 0: + output += '\n' + output += '))fn)(&global_tid, &bound_tid\n' + for j in range(i): + output += ', args[%d]'%(j) + if (j+1)%4 == 0: + output += '\n' + output += ');\n' + output += 'break;\n' + + with open(args.output, 'w') as f: + print(output, file=f) + +if __name__ == "__main__": + main()
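For context on the generated cases and __kmp_invoke_microtask above: the runtime receives the outlined region as an opaque `void *fn` plus an array of `void *` arguments, switches on `nargs`, and casts `fn` back to a function-pointer type of the matching arity before calling it. The switch must be generated (by generate_microtask_cases.py) because C/C++ has no portable way to forward a runtime-determined number of arguments to an arbitrary function. Below is a minimal, self-contained host-side C++ sketch of that dispatch idiom, reduced to a single arity; `demo_microtask` and `invoke_microtask` are hypothetical stand-ins for illustration, not the deviceRTL implementation, and the function-pointer/void* conversion it relies on is only conditionally supported by the standard (though accepted by the compilers this runtime targets).

#include <cstddef>
#include <cstdio>
#include <cstdlib>

typedef int kmp_int32;

// Hypothetical outlined region with two captured variables, shaped like the
// functions Clang emits for '#pragma omp parallel' (global/bound TID first,
// then one 'void *' per captured variable).
static void demo_microtask(kmp_int32 *global_tid, kmp_int32 *bound_tid,
                           void *a, void *b) {
  (void)global_tid;
  (void)bound_tid;
  *(int *)a += *(int *)b;
}

// Same cast-and-call dispatch idiom as __kmp_invoke_microtask, with only
// the two-argument case spelled out.
static void invoke_microtask(kmp_int32 global_tid, kmp_int32 bound_tid,
                             void *fn, void **args, std::size_t nargs) {
  switch (nargs) {
  case 2:
    // Mirrors 'case 2:' in generated_microtask_cases.gen.
    ((void (*)(kmp_int32 *, kmp_int32 *, void *, void *))fn)(
        &global_tid, &bound_tid, args[0], args[1]);
    break;
  default:
    std::printf("unsupported nargs in this sketch\n");
    std::abort();
  }
}

int main() {
  int x = 40, y = 2;
  void *args[] = {&x, &y};
  invoke_microtask(/*global_tid=*/0, /*bound_tid=*/0, (void *)demo_microtask,
                   args, /*nargs=*/2);
  std::printf("x = %d\n", x); // prints: x = 42
  return 0;
}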